| |
| <!DOCTYPE html> |
| |
| <html> |
| <head> |
| <meta charset="utf-8" /> |
| <title>pyspark.sql.dataframe — PySpark 3.2.2 documentation</title> |
| |
| <link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <link rel="stylesheet" |
| href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <link rel="stylesheet" |
| href="../../../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../../../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/language_data.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"> |
| <div class="container-xl"> |
| |
| <a class="navbar-brand" href="../../../index.html"> |
| |
| <img src="../../../_static/spark-logo-reverse.png" class="logo" alt="logo" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../getting_started/index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get"> |
| <i class="icon fas fa-search"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </ul> |
| |
| </nav> |
| </div> |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| |
| <nav id="bd-toc-nav"> |
| <ul class="nav section-nav flex-column"> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <h1>Source code for pyspark.sql.dataframe</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">random</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| <span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Iterable</span> |
| <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span> |
| <span class="kn">from</span> <span class="nn">html</span> <span class="kn">import</span> <span class="n">escape</span> <span class="k">as</span> <span class="n">html_escape</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">copy_func</span><span class="p">,</span> <span class="n">since</span><span class="p">,</span> <span class="n">_NoValue</span> |
| <span class="kn">from</span> <span class="nn">pyspark.rdd</span> <span class="kn">import</span> <span class="n">RDD</span><span class="p">,</span> <span class="n">_load_from_socket</span><span class="p">,</span> <span class="n">_local_iterator_from_socket</span> |
| <span class="kn">from</span> <span class="nn">pyspark.serializers</span> <span class="kn">import</span> <span class="n">BatchedSerializer</span><span class="p">,</span> <span class="n">PickleSerializer</span><span class="p">,</span> \ |
| <span class="n">UTF8Deserializer</span> |
| <span class="kn">from</span> <span class="nn">pyspark.storagelevel</span> <span class="kn">import</span> <span class="n">StorageLevel</span> |
| <span class="kn">from</span> <span class="nn">pyspark.traceback_utils</span> <span class="kn">import</span> <span class="n">SCCallSiteSync</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">_parse_datatype_json_string</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.column</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">,</span> <span class="n">_to_list</span><span class="p">,</span> <span class="n">_to_java_column</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.readwriter</span> <span class="kn">import</span> <span class="n">DataFrameWriter</span><span class="p">,</span> <span class="n">DataFrameWriterV2</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.streaming</span> <span class="kn">import</span> <span class="n">DataStreamWriter</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">StructField</span><span class="p">,</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">IntegerType</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.pandas.conversion</span> <span class="kn">import</span> <span class="n">PandasConversionMixin</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.pandas.map_ops</span> <span class="kn">import</span> <span class="n">PandasMapOpsMixin</span> |
| |
| <span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">,</span> <span class="s2">"DataFrameNaFunctions"</span><span class="p">,</span> <span class="s2">"DataFrameStatFunctions"</span><span class="p">]</span> |
| |
| |
| <div class="viewcode-block" id="DataFrame"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame">[docs]</a><span class="k">class</span> <span class="nc">DataFrame</span><span class="p">(</span><span class="n">PandasMapOpsMixin</span><span class="p">,</span> <span class="n">PandasConversionMixin</span><span class="p">):</span> |
| <span class="sd">"""A distributed collection of data grouped into named columns.</span> |
| |
| <span class="sd"> A :class:`DataFrame` is equivalent to a relational table in Spark SQL,</span> |
| <span class="sd"> and can be created using various functions in :class:`SparkSession`::</span> |
| |
| <span class="sd"> people = spark.read.parquet("...")</span> |
| |
| <span class="sd"> Once created, it can be manipulated using the various domain-specific-language</span> |
| <span class="sd"> (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.</span> |
| |
| <span class="sd"> To select a column from the :class:`DataFrame`, use the apply method::</span> |
| |
| <span class="sd"> ageCol = people.age</span> |
| |
| <span class="sd"> A more concrete example::</span> |
| |
| <span class="sd"> # To create DataFrame using SparkSession</span> |
| <span class="sd"> people = spark.read.parquet("...")</span> |
| <span class="sd"> department = spark.read.parquet("...")</span> |
| |
| <span class="sd"> people.filter(people.age > 30).join(department, people.deptId == department.id) \\</span> |
| <span class="sd"> .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">jdf</span><span class="p">,</span> <span class="n">sql_ctx</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span> <span class="o">=</span> <span class="n">jdf</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span> <span class="o">=</span> <span class="n">sql_ctx</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span> <span class="o">=</span> <span class="n">sql_ctx</span> <span class="ow">and</span> <span class="n">sql_ctx</span><span class="o">.</span><span class="n">_sc</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_schema</span> <span class="o">=</span> <span class="kc">None</span> <span class="c1"># initialized lazily</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_lazy_rdd</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="c1"># Check whether _repr_html is supported or not, we use it to avoid calling _jdf twice</span> |
| <span class="c1"># by __repr__ and _repr_html_ while eager evaluation opened.</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_support_repr_html</span> <span class="o">=</span> <span class="kc">False</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">rdd</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns the content as an :class:`pyspark.RDD` of :class:`Row`.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_lazy_rdd</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">jrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">javaToPython</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_lazy_rdd</span> <span class="o">=</span> <span class="n">RDD</span><span class="p">(</span><span class="n">jrdd</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">PickleSerializer</span><span class="p">()))</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_lazy_rdd</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.3.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">na</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns a :class:`DataFrameNaFunctions` for handling missing values.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrameNaFunctions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">stat</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns a :class:`DataFrameStatFunctions` for statistic functions.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrameStatFunctions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.toJSON"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.toJSON.html#pyspark.sql.DataFrame.toJSON">[docs]</a> <span class="k">def</span> <span class="nf">toJSON</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">use_unicode</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span> |
| <span class="sd">"""Converts a :class:`DataFrame` into a :class:`RDD` of string.</span> |
| |
| <span class="sd"> Each row is turned into a JSON document as one element in the returned RDD.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.toJSON().first()</span> |
| <span class="sd"> '{"age":2,"name":"Alice"}'</span> |
| <span class="sd"> """</span> |
| <span class="n">rdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">toJSON</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">RDD</span><span class="p">(</span><span class="n">rdd</span><span class="o">.</span><span class="n">toJavaRDD</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">UTF8Deserializer</span><span class="p">(</span><span class="n">use_unicode</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.registerTempTable"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.registerTempTable.html#pyspark.sql.DataFrame.registerTempTable">[docs]</a> <span class="k">def</span> <span class="nf">registerTempTable</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="sd">"""Registers this :class:`DataFrame` as a temporary table using the given name.</span> |
| |
| <span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span> |
| <span class="sd"> that was used to create this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. deprecated:: 2.0.0</span> |
| <span class="sd"> Use :meth:`DataFrame.createOrReplaceTempView` instead.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.registerTempTable("people")</span> |
| <span class="sd"> >>> df2 = spark.sql("select * from people")</span> |
| <span class="sd"> >>> sorted(df.collect()) == sorted(df2.collect())</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> spark.catalog.dropTempView("people")</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"Deprecated in 2.0, use createOrReplaceTempView instead."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createTempView"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.createTempView.html#pyspark.sql.DataFrame.createTempView">[docs]</a> <span class="k">def</span> <span class="nf">createTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="sd">"""Creates a local temporary view with this :class:`DataFrame`.</span> |
| |
| <span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span> |
| <span class="sd"> that was used to create this :class:`DataFrame`.</span> |
| <span class="sd"> throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the</span> |
| <span class="sd"> catalog.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.createTempView("people")</span> |
| <span class="sd"> >>> df2 = spark.sql("select * from people")</span> |
| <span class="sd"> >>> sorted(df.collect()) == sorted(df2.collect())</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> df.createTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> AnalysisException: u"Temporary table 'people' already exists;"</span> |
| <span class="sd"> >>> spark.catalog.dropTempView("people")</span> |
| |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">createTempView</span><span class="p">(</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createOrReplaceTempView"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.createOrReplaceTempView.html#pyspark.sql.DataFrame.createOrReplaceTempView">[docs]</a> <span class="k">def</span> <span class="nf">createOrReplaceTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="sd">"""Creates or replaces a local temporary view with this :class:`DataFrame`.</span> |
| |
| <span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span> |
| <span class="sd"> that was used to create this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.createOrReplaceTempView("people")</span> |
| <span class="sd"> >>> df2 = df.filter(df.age > 3)</span> |
| <span class="sd"> >>> df2.createOrReplaceTempView("people")</span> |
| <span class="sd"> >>> df3 = spark.sql("select * from people")</span> |
| <span class="sd"> >>> sorted(df3.collect()) == sorted(df2.collect())</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> spark.catalog.dropTempView("people")</span> |
| |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createGlobalTempView"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.createGlobalTempView.html#pyspark.sql.DataFrame.createGlobalTempView">[docs]</a> <span class="k">def</span> <span class="nf">createGlobalTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="sd">"""Creates a global temporary view with this :class:`DataFrame`.</span> |
| |
| <span class="sd"> The lifetime of this temporary view is tied to this Spark application.</span> |
| <span class="sd"> throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the</span> |
| <span class="sd"> catalog.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.createGlobalTempView("people")</span> |
| <span class="sd"> >>> df2 = spark.sql("select * from global_temp.people")</span> |
| <span class="sd"> >>> sorted(df.collect()) == sorted(df2.collect())</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> AnalysisException: u"Temporary table 'people' already exists;"</span> |
| <span class="sd"> >>> spark.catalog.dropGlobalTempView("people")</span> |
| |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">createGlobalTempView</span><span class="p">(</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createOrReplaceGlobalTempView"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.createOrReplaceGlobalTempView.html#pyspark.sql.DataFrame.createOrReplaceGlobalTempView">[docs]</a> <span class="k">def</span> <span class="nf">createOrReplaceGlobalTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="sd">"""Creates or replaces a global temporary view using the given name.</span> |
| |
| <span class="sd"> The lifetime of this temporary view is tied to this Spark application.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.createOrReplaceGlobalTempView("people")</span> |
| <span class="sd"> >>> df2 = df.filter(df.age > 3)</span> |
| <span class="sd"> >>> df2.createOrReplaceGlobalTempView("people")</span> |
| <span class="sd"> >>> df3 = spark.sql("select * from global_temp.people")</span> |
| <span class="sd"> >>> sorted(df3.collect()) == sorted(df2.collect())</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> spark.catalog.dropGlobalTempView("people")</span> |
| |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">createOrReplaceGlobalTempView</span><span class="p">(</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Interface for saving the content of the non-streaming :class:`DataFrame` out into external</span> |
| <span class="sd"> storage.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrameWriter`</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrameWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">writeStream</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Interface for saving the content of the streaming :class:`DataFrame` out into external</span> |
| <span class="sd"> storage.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is evolving.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataStreamWriter`</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataStreamWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">schema</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns the schema of this :class:`DataFrame` as a :class:`pyspark.sql.types.StructType`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.schema</span> |
| <span class="sd"> StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_schema</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_schema</span> <span class="o">=</span> <span class="n">_parse_datatype_json_string</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">schema</span><span class="p">()</span><span class="o">.</span><span class="n">json</span><span class="p">())</span> |
| <span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Unable to parse datatype from schema. </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">e</span><span class="p">)</span> <span class="kn">from</span> <span class="nn">e</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_schema</span> |
| |
| <div class="viewcode-block" id="DataFrame.printSchema"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.printSchema.html#pyspark.sql.DataFrame.printSchema">[docs]</a> <span class="k">def</span> <span class="nf">printSchema</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Prints out the schema in the tree format.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- age: integer (nullable = true)</span> |
| <span class="sd"> |-- name: string (nullable = true)</span> |
| <span class="sd"> <BLANKLINE></span> |
| <span class="sd"> """</span> |
| <span class="nb">print</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">schema</span><span class="p">()</span><span class="o">.</span><span class="n">treeString</span><span class="p">())</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.explain"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.explain.html#pyspark.sql.DataFrame.explain">[docs]</a> <span class="k">def</span> <span class="nf">explain</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">extended</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Prints the (logical and physical) plans to the console for debugging purpose.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> extended : bool, optional</span> |
| <span class="sd"> default ``False``. If ``False``, prints only the physical plan.</span> |
| <span class="sd"> When this is a string without specifying the ``mode``, it works as the mode is</span> |
| <span class="sd"> specified.</span> |
| <span class="sd"> mode : str, optional</span> |
| <span class="sd"> specifies the expected output format of plans.</span> |
| |
| <span class="sd"> * ``simple``: Print only a physical plan.</span> |
| <span class="sd"> * ``extended``: Print both logical and physical plans.</span> |
| <span class="sd"> * ``codegen``: Print a physical plan and generated codes if they are available.</span> |
| <span class="sd"> * ``cost``: Print a logical plan and statistics if they are available.</span> |
| <span class="sd"> * ``formatted``: Split explain output into two sections: a physical plan outline \</span> |
| <span class="sd"> and node details.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.0.0</span> |
| <span class="sd"> Added optional argument `mode` to specify the expected output format of plans.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.explain()</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> *(1) Scan ExistingRDD[age#0,name#1]</span> |
| |
| <span class="sd"> >>> df.explain(True)</span> |
| <span class="sd"> == Parsed Logical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> == Analyzed Logical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> == Optimized Logical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> ...</span> |
| |
| <span class="sd"> >>> df.explain(mode="formatted")</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> * Scan ExistingRDD (1)</span> |
| <span class="sd"> (1) Scan ExistingRDD [codegen id : 1]</span> |
| <span class="sd"> Output [2]: [age#0, name#1]</span> |
| <span class="sd"> ...</span> |
| |
| <span class="sd"> >>> df.explain("cost")</span> |
| <span class="sd"> == Optimized Logical Plan ==</span> |
| <span class="sd"> ...Statistics...</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">if</span> <span class="n">extended</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">mode</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"extended and mode should not be set together."</span><span class="p">)</span> |
| |
| <span class="c1"># For the no argument case: df.explain()</span> |
| <span class="n">is_no_argument</span> <span class="o">=</span> <span class="n">extended</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> |
| |
| <span class="c1"># For the cases below:</span> |
| <span class="c1"># explain(True)</span> |
| <span class="c1"># explain(extended=False)</span> |
| <span class="n">is_extended_case</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">extended</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="ow">and</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> |
| |
| <span class="c1"># For the case when extended is mode:</span> |
| <span class="c1"># df.explain("formatted")</span> |
| <span class="n">is_extended_as_mode</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">extended</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">and</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> |
| |
| <span class="c1"># For the mode specified:</span> |
| <span class="c1"># df.explain(mode="formatted")</span> |
| <span class="n">is_mode_case</span> <span class="o">=</span> <span class="n">extended</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">mode</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="n">is_no_argument</span> <span class="ow">or</span> <span class="n">is_extended_case</span> <span class="ow">or</span> <span class="n">is_extended_as_mode</span> <span class="ow">or</span> <span class="n">is_mode_case</span><span class="p">):</span> |
| <span class="n">argtypes</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="nb">str</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="p">[</span><span class="n">extended</span><span class="p">,</span> <span class="n">mode</span><span class="p">]</span> <span class="k">if</span> <span class="n">arg</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">]</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"extended (optional) and mode (optional) should be a string "</span> |
| <span class="s2">"and bool; however, got [</span><span class="si">%s</span><span class="s2">]."</span> <span class="o">%</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">argtypes</span><span class="p">))</span> |
| |
| <span class="c1"># Sets an explain mode depending on a given argument</span> |
| <span class="k">if</span> <span class="n">is_no_argument</span><span class="p">:</span> |
| <span class="n">explain_mode</span> <span class="o">=</span> <span class="s2">"simple"</span> |
| <span class="k">elif</span> <span class="n">is_extended_case</span><span class="p">:</span> |
| <span class="n">explain_mode</span> <span class="o">=</span> <span class="s2">"extended"</span> <span class="k">if</span> <span class="n">extended</span> <span class="k">else</span> <span class="s2">"simple"</span> |
| <span class="k">elif</span> <span class="n">is_mode_case</span><span class="p">:</span> |
| <span class="n">explain_mode</span> <span class="o">=</span> <span class="n">mode</span> |
| <span class="k">elif</span> <span class="n">is_extended_as_mode</span><span class="p">:</span> |
| <span class="n">explain_mode</span> <span class="o">=</span> <span class="n">extended</span> |
| |
| <span class="nb">print</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonSQLUtils</span><span class="o">.</span><span class="n">explainString</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">queryExecution</span><span class="p">(),</span> <span class="n">explain_mode</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.exceptAll"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.exceptAll.html#pyspark.sql.DataFrame.exceptAll">[docs]</a> <span class="k">def</span> <span class="nf">exceptAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">"""Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but</span> |
| <span class="sd"> not in another :class:`DataFrame` while preserving duplicates.</span> |
| |
| <span class="sd"> This is equivalent to `EXCEPT ALL` in SQL.</span> |
| <span class="sd"> As standard in SQL, this function resolves columns by position (not by name).</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame(</span> |
| <span class="sd"> ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])</span> |
| |
| <span class="sd"> >>> df1.exceptAll(df2).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | C1| C2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 2|</span> |
| <span class="sd"> | c| 4|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">exceptAll</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.isLocal"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.isLocal.html#pyspark.sql.DataFrame.isLocal">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">isLocal</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally</span> |
| <span class="sd"> (without any Spark executors).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">isLocal</span><span class="p">()</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">isStreaming</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns ``True`` if this :class:`DataFrame` contains one or more sources that</span> |
| <span class="sd"> continuously return data as it arrives. A :class:`DataFrame` that reads data from a</span> |
| <span class="sd"> streaming source must be executed as a :class:`StreamingQuery` using the :func:`start`</span> |
| <span class="sd"> method in :class:`DataStreamWriter`. Methods that return a single answer, (e.g.,</span> |
| <span class="sd"> :func:`count` or :func:`collect`) will throw an :class:`AnalysisException` when there</span> |
| <span class="sd"> is a streaming source present.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is evolving.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">isStreaming</span><span class="p">()</span> |
| |
| <div class="viewcode-block" id="DataFrame.show"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.show.html#pyspark.sql.DataFrame.show">[docs]</a> <span class="k">def</span> <span class="nf">show</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">truncate</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">vertical</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> |
| <span class="sd">"""Prints the first ``n`` rows to the console.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, optional</span> |
| <span class="sd"> Number of rows to show.</span> |
| <span class="sd"> truncate : bool or int, optional</span> |
| <span class="sd"> If set to ``True``, truncate strings longer than 20 chars by default.</span> |
| <span class="sd"> If set to a number greater than one, truncates long strings to length ``truncate``</span> |
| <span class="sd"> and align cells right.</span> |
| <span class="sd"> vertical : bool, optional</span> |
| <span class="sd"> If set to ``True``, print output rows vertically (one line</span> |
| <span class="sd"> per column value).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> DataFrame[age: int, name: string]</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> >>> df.show(truncate=3)</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> |age|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | 2| Ali|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> >>> df.show(vertical=True)</span> |
| <span class="sd"> -RECORD 0-----</span> |
| <span class="sd"> age | 2</span> |
| <span class="sd"> name | Alice</span> |
| <span class="sd"> -RECORD 1-----</span> |
| <span class="sd"> age | 5</span> |
| <span class="sd"> name | Bob</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">bool</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Parameter 'n' (number of rows) must be an int"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">vertical</span><span class="p">,</span> <span class="nb">bool</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Parameter 'vertical' must be a bool"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">truncate</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="ow">and</span> <span class="n">truncate</span><span class="p">:</span> |
| <span class="nb">print</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">showString</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="mi">20</span><span class="p">,</span> <span class="n">vertical</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">int_truncate</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">truncate</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">ValueError</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Parameter 'truncate=</span><span class="si">{}</span><span class="s2">' should be either bool or int."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">truncate</span><span class="p">))</span> |
| |
| <span class="nb">print</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">showString</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">int_truncate</span><span class="p">,</span> <span class="n">vertical</span><span class="p">))</span></div> |
| |
| <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_support_repr_html</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">isReplEagerEvalEnabled</span><span class="p">():</span> |
| <span class="n">vertical</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">showString</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">replEagerEvalMaxNumRows</span><span class="p">(),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">replEagerEvalTruncate</span><span class="p">(),</span> <span class="n">vertical</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="s2">"DataFrame[</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="p">(</span><span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2">: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">dtypes</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="nf">_repr_html_</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns a :class:`DataFrame` with html code when you enabled eager evaluation</span> |
| <span class="sd"> by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are</span> |
| <span class="sd"> using support eager evaluation with HTML.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_support_repr_html</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_support_repr_html</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">isReplEagerEvalEnabled</span><span class="p">():</span> |
| <span class="n">max_num_rows</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">replEagerEvalMaxNumRows</span><span class="p">(),</span> <span class="mi">0</span><span class="p">)</span> |
| <span class="n">sock_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">getRowsToPython</span><span class="p">(</span> |
| <span class="n">max_num_rows</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">replEagerEvalTruncate</span><span class="p">())</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">_load_from_socket</span><span class="p">(</span><span class="n">sock_info</span><span class="p">,</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">PickleSerializer</span><span class="p">())))</span> |
| <span class="n">head</span> <span class="o">=</span> <span class="n">rows</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">row_data</span> <span class="o">=</span> <span class="n">rows</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span> |
| <span class="n">has_more_data</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">row_data</span><span class="p">)</span> <span class="o">></span> <span class="n">max_num_rows</span> |
| <span class="n">row_data</span> <span class="o">=</span> <span class="n">row_data</span><span class="p">[:</span><span class="n">max_num_rows</span><span class="p">]</span> |
| |
| <span class="n">html</span> <span class="o">=</span> <span class="s2">"<table border='1'></span><span class="se">\n</span><span class="s2">"</span> |
| <span class="c1"># generate table head</span> |
| <span class="n">html</span> <span class="o">+=</span> <span class="s2">"<tr><th></span><span class="si">%s</span><span class="s2"></th></tr></span><span class="se">\n</span><span class="s2">"</span> <span class="o">%</span> <span class="s2">"</th><th>"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">html_escape</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">head</span><span class="p">))</span> |
| <span class="c1"># generate table rows</span> |
| <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">row_data</span><span class="p">:</span> |
| <span class="n">html</span> <span class="o">+=</span> <span class="s2">"<tr><td></span><span class="si">%s</span><span class="s2"></td></tr></span><span class="se">\n</span><span class="s2">"</span> <span class="o">%</span> <span class="s2">"</td><td>"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> |
| <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">html_escape</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">row</span><span class="p">))</span> |
| <span class="n">html</span> <span class="o">+=</span> <span class="s2">"</table></span><span class="se">\n</span><span class="s2">"</span> |
| <span class="k">if</span> <span class="n">has_more_data</span><span class="p">:</span> |
| <span class="n">html</span> <span class="o">+=</span> <span class="s2">"only showing top </span><span class="si">%d</span><span class="s2"> </span><span class="si">%s</span><span class="se">\n</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span> |
| <span class="n">max_num_rows</span><span class="p">,</span> <span class="s2">"row"</span> <span class="k">if</span> <span class="n">max_num_rows</span> <span class="o">==</span> <span class="mi">1</span> <span class="k">else</span> <span class="s2">"rows"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">html</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <div class="viewcode-block" id="DataFrame.checkpoint"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.checkpoint.html#pyspark.sql.DataFrame.checkpoint">[docs]</a> <span class="k">def</span> <span class="nf">checkpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eager</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span> |
| <span class="sd">"""Returns a checkpointed version of this :class:`DataFrame`. Checkpointing can be used to</span> |
| <span class="sd"> truncate the logical plan of this :class:`DataFrame`, which is especially useful in</span> |
| <span class="sd"> iterative algorithms where the plan may grow exponentially. It will be saved to files</span> |
| <span class="sd"> inside the checkpoint directory set with :meth:`SparkContext.setCheckpointDir`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> eager : bool, optional</span> |
| <span class="sd"> Whether to checkpoint this :class:`DataFrame` immediately</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is experimental.</span> |
| <span class="sd"> """</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">checkpoint</span><span class="p">(</span><span class="n">eager</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.localCheckpoint"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.localCheckpoint.html#pyspark.sql.DataFrame.localCheckpoint">[docs]</a> <span class="k">def</span> <span class="nf">localCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eager</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span> |
| <span class="sd">"""Returns a locally checkpointed version of this :class:`DataFrame`. Checkpointing can be</span> |
| <span class="sd"> used to truncate the logical plan of this :class:`DataFrame`, which is especially useful in</span> |
| <span class="sd"> iterative algorithms where the plan may grow exponentially. Local checkpoints are</span> |
| <span class="sd"> stored in the executors using the caching subsystem and therefore they are not reliable.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> eager : bool, optional</span> |
| <span class="sd"> Whether to checkpoint this :class:`DataFrame` immediately</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is experimental.</span> |
| <span class="sd"> """</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">localCheckpoint</span><span class="p">(</span><span class="n">eager</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withWatermark"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.withWatermark.html#pyspark.sql.DataFrame.withWatermark">[docs]</a> <span class="k">def</span> <span class="nf">withWatermark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eventTime</span><span class="p">,</span> <span class="n">delayThreshold</span><span class="p">):</span> |
| <span class="sd">"""Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point</span> |
| <span class="sd"> in time before which we assume no more late data is going to arrive.</span> |
| |
| <span class="sd"> Spark will use this watermark for several purposes:</span> |
| <span class="sd"> - To know when a given time window aggregation can be finalized and thus can be emitted</span> |
| <span class="sd"> when using output modes that do not allow updates.</span> |
| |
| <span class="sd"> - To minimize the amount of state that we need to keep for on-going aggregations.</span> |
| |
| <span class="sd"> The current watermark is computed by looking at the `MAX(eventTime)` seen across</span> |
| <span class="sd"> all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost</span> |
| <span class="sd"> of coordinating this value across partitions, the actual watermark used is only guaranteed</span> |
| <span class="sd"> to be at least `delayThreshold` behind the actual event time. In some cases we may still</span> |
| <span class="sd"> process records that arrive more than `delayThreshold` late.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> eventTime : str</span> |
| <span class="sd"> the name of the column that contains the event time of the row.</span> |
| <span class="sd"> delayThreshold : str</span> |
| <span class="sd"> the minimum delay to wait to data to arrive late, relative to the</span> |
| <span class="sd"> latest record that has been processed in the form of an interval</span> |
| <span class="sd"> (e.g. "1 minute" or "5 hours").</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is evolving.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.functions import timestamp_seconds</span> |
| <span class="sd"> >>> sdf.select(</span> |
| <span class="sd"> ... 'name',</span> |
| <span class="sd"> ... timestamp_seconds(sdf.time).alias('time')).withWatermark('time', '10 minutes')</span> |
| <span class="sd"> DataFrame[name: string, time: timestamp]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">eventTime</span> <span class="ow">or</span> <span class="nb">type</span><span class="p">(</span><span class="n">eventTime</span><span class="p">)</span> <span class="ow">is</span> <span class="ow">not</span> <span class="nb">str</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"eventTime should be provided as a string"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">delayThreshold</span> <span class="ow">or</span> <span class="nb">type</span><span class="p">(</span><span class="n">delayThreshold</span><span class="p">)</span> <span class="ow">is</span> <span class="ow">not</span> <span class="nb">str</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"delayThreshold should be provided as a string interval"</span><span class="p">)</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">withWatermark</span><span class="p">(</span><span class="n">eventTime</span><span class="p">,</span> <span class="n">delayThreshold</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.hint"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.hint.html#pyspark.sql.DataFrame.hint">[docs]</a> <span class="k">def</span> <span class="nf">hint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="o">*</span><span class="n">parameters</span><span class="p">):</span> |
| <span class="sd">"""Specifies some hint on the current :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> A name of the hint.</span> |
| <span class="sd"> parameters : str, list, float or int</span> |
| <span class="sd"> Optional parameters.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.join(df2.hint("broadcast"), "name").show()</span> |
| <span class="sd"> +----+---+------+</span> |
| <span class="sd"> |name|age|height|</span> |
| <span class="sd"> +----+---+------+</span> |
| <span class="sd"> | Bob| 5| 85|</span> |
| <span class="sd"> +----+---+------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">parameters</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"name should be provided as str, got </span><span class="si">{0}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">name</span><span class="p">)))</span> |
| |
| <span class="n">allowed_types</span> <span class="o">=</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">parameters</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">p</span><span class="p">,</span> <span class="n">allowed_types</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"all parameters should be in </span><span class="si">{0}</span><span class="s2">, got </span><span class="si">{1}</span><span class="s2"> of type </span><span class="si">{2}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">allowed_types</span><span class="p">,</span> <span class="n">p</span><span class="p">,</span> <span class="nb">type</span><span class="p">(</span><span class="n">p</span><span class="p">)))</span> |
| |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">hint</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">parameters</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.count"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.count.html#pyspark.sql.DataFrame.count">[docs]</a> <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns the number of rows in this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.count()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="nb">int</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">count</span><span class="p">())</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.collect"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.collect.html#pyspark.sql.DataFrame.collect">[docs]</a> <span class="k">def</span> <span class="nf">collect</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns all the records as a list of :class:`Row`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice'), Row(age=5, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="k">with</span> <span class="n">SCCallSiteSync</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">)</span> <span class="k">as</span> <span class="n">css</span><span class="p">:</span> |
| <span class="n">sock_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">collectToPython</span><span class="p">()</span> |
| <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="n">_load_from_socket</span><span class="p">(</span><span class="n">sock_info</span><span class="p">,</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">PickleSerializer</span><span class="p">())))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.toLocalIterator"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.toLocalIterator.html#pyspark.sql.DataFrame.toLocalIterator">[docs]</a> <span class="k">def</span> <span class="nf">toLocalIterator</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">prefetchPartitions</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns an iterator that contains all of the rows in this :class:`DataFrame`.</span> |
| <span class="sd"> The iterator will consume as much memory as the largest partition in this</span> |
| <span class="sd"> :class:`DataFrame`. With prefetch it may consume up to the memory of the 2 largest</span> |
| <span class="sd"> partitions.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> prefetchPartitions : bool, optional</span> |
| <span class="sd"> If Spark should pre-fetch the next partition before it is needed.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> list(df.toLocalIterator())</span> |
| <span class="sd"> [Row(age=2, name='Alice'), Row(age=5, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="k">with</span> <span class="n">SCCallSiteSync</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">)</span> <span class="k">as</span> <span class="n">css</span><span class="p">:</span> |
| <span class="n">sock_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">toPythonIterator</span><span class="p">(</span><span class="n">prefetchPartitions</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_local_iterator_from_socket</span><span class="p">(</span><span class="n">sock_info</span><span class="p">,</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">PickleSerializer</span><span class="p">()))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.limit"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.limit.html#pyspark.sql.DataFrame.limit">[docs]</a> <span class="k">def</span> <span class="nf">limit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">):</span> |
| <span class="sd">"""Limits the result count to the number specified.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.limit(1).collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice')]</span> |
| <span class="sd"> >>> df.limit(0).collect()</span> |
| <span class="sd"> []</span> |
| <span class="sd"> """</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="n">num</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.take"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.take.html#pyspark.sql.DataFrame.take">[docs]</a> <span class="k">def</span> <span class="nf">take</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">):</span> |
| <span class="sd">"""Returns the first ``num`` rows as a :class:`list` of :class:`Row`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.take(2)</span> |
| <span class="sd"> [Row(age=2, name='Alice'), Row(age=5, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="n">num</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.tail"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.tail.html#pyspark.sql.DataFrame.tail">[docs]</a> <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns the last ``num`` rows as a :class:`list` of :class:`Row`.</span> |
| |
| <span class="sd"> Running tail requires moving data into the application's driver process, and doing so with</span> |
| <span class="sd"> a very large ``num`` can crash the driver process with OutOfMemoryError.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.tail(1)</span> |
| <span class="sd"> [Row(age=5, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="k">with</span> <span class="n">SCCallSiteSync</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">):</span> |
| <span class="n">sock_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">tailToPython</span><span class="p">(</span><span class="n">num</span><span class="p">)</span> |
| <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="n">_load_from_socket</span><span class="p">(</span><span class="n">sock_info</span><span class="p">,</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">PickleSerializer</span><span class="p">())))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.foreach"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.foreach.html#pyspark.sql.DataFrame.foreach">[docs]</a> <span class="k">def</span> <span class="nf">foreach</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span> |
| <span class="sd">"""Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is a shorthand for ``df.rdd.foreach()``.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> def f(person):</span> |
| <span class="sd"> ... print(person.name)</span> |
| <span class="sd"> >>> df.foreach(f)</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">foreach</span><span class="p">(</span><span class="n">f</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.foreachPartition"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.foreachPartition.html#pyspark.sql.DataFrame.foreachPartition">[docs]</a> <span class="k">def</span> <span class="nf">foreachPartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span> |
| <span class="sd">"""Applies the ``f`` function to each partition of this :class:`DataFrame`.</span> |
| |
| <span class="sd"> This a shorthand for ``df.rdd.foreachPartition()``.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> def f(people):</span> |
| <span class="sd"> ... for person in people:</span> |
| <span class="sd"> ... print(person.name)</span> |
| <span class="sd"> >>> df.foreachPartition(f)</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">foreachPartition</span><span class="p">(</span><span class="n">f</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.cache"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.cache.html#pyspark.sql.DataFrame.cache">[docs]</a> <span class="k">def</span> <span class="nf">cache</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK`).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The default storage level has changed to `MEMORY_AND_DISK` to match Scala in 2.0.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.persist"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.persist.html#pyspark.sql.DataFrame.persist">[docs]</a> <span class="k">def</span> <span class="nf">persist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">storageLevel</span><span class="o">=</span><span class="n">StorageLevel</span><span class="o">.</span><span class="n">MEMORY_AND_DISK_DESER</span><span class="p">):</span> |
| <span class="sd">"""Sets the storage level to persist the contents of the :class:`DataFrame` across</span> |
| <span class="sd"> operations after the first time it is computed. This can only be used to assign</span> |
| <span class="sd"> a new storage level if the :class:`DataFrame` does not have a storage level set yet.</span> |
| <span class="sd"> If no storage level is specified defaults to (`MEMORY_AND_DISK_DESER`)</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">javaStorageLevel</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_getJavaStorageLevel</span><span class="p">(</span><span class="n">storageLevel</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">persist</span><span class="p">(</span><span class="n">javaStorageLevel</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">storageLevel</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Get the :class:`DataFrame`'s current storage level.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.storageLevel</span> |
| <span class="sd"> StorageLevel(False, False, False, False, 1)</span> |
| <span class="sd"> >>> df.cache().storageLevel</span> |
| <span class="sd"> StorageLevel(True, True, False, True, 1)</span> |
| <span class="sd"> >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel</span> |
| <span class="sd"> StorageLevel(True, False, False, False, 2)</span> |
| <span class="sd"> """</span> |
| <span class="n">java_storage_level</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">storageLevel</span><span class="p">()</span> |
| <span class="n">storage_level</span> <span class="o">=</span> <span class="n">StorageLevel</span><span class="p">(</span><span class="n">java_storage_level</span><span class="o">.</span><span class="n">useDisk</span><span class="p">(),</span> |
| <span class="n">java_storage_level</span><span class="o">.</span><span class="n">useMemory</span><span class="p">(),</span> |
| <span class="n">java_storage_level</span><span class="o">.</span><span class="n">useOffHeap</span><span class="p">(),</span> |
| <span class="n">java_storage_level</span><span class="o">.</span><span class="n">deserialized</span><span class="p">(),</span> |
| <span class="n">java_storage_level</span><span class="o">.</span><span class="n">replication</span><span class="p">())</span> |
| <span class="k">return</span> <span class="n">storage_level</span> |
| |
| <div class="viewcode-block" id="DataFrame.unpersist"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.unpersist.html#pyspark.sql.DataFrame.unpersist">[docs]</a> <span class="k">def</span> <span class="nf">unpersist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">blocking</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> |
| <span class="sd">"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from</span> |
| <span class="sd"> memory and disk.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `blocking` default has changed to ``False`` to match Scala in 2.0.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">unpersist</span><span class="p">(</span><span class="n">blocking</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.coalesce"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.coalesce.html#pyspark.sql.DataFrame.coalesce">[docs]</a> <span class="k">def</span> <span class="nf">coalesce</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.</span> |
| |
| <span class="sd"> Similar to coalesce defined on an :class:`RDD`, this operation results in a</span> |
| <span class="sd"> narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,</span> |
| <span class="sd"> there will not be a shuffle, instead each of the 100 new partitions will</span> |
| <span class="sd"> claim 10 of the current partitions. If a larger number of partitions is requested,</span> |
| <span class="sd"> it will stay at the current number of partitions.</span> |
| |
| <span class="sd"> However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,</span> |
| <span class="sd"> this may result in your computation taking place on fewer nodes than</span> |
| <span class="sd"> you like (e.g. one node in the case of numPartitions = 1). To avoid this,</span> |
| <span class="sd"> you can call repartition(). This will add a shuffle step, but means the</span> |
| <span class="sd"> current upstream partitions will be executed in parallel (per whatever</span> |
| <span class="sd"> the current partitioning is).</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numPartitions : int</span> |
| <span class="sd"> specify the target number of partitions</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.coalesce(1).rdd.getNumPartitions()</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.repartition"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.repartition.html#pyspark.sql.DataFrame.repartition">[docs]</a> <span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The</span> |
| <span class="sd"> resulting :class:`DataFrame` is hash partitioned.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numPartitions : int</span> |
| <span class="sd"> can be an int to specify the target number of partitions or a Column.</span> |
| <span class="sd"> If it is a Column, it will be used as the first partitioning column. If not specified,</span> |
| <span class="sd"> the default number of partitions is used.</span> |
| <span class="sd"> cols : str or :class:`Column`</span> |
| <span class="sd"> partitioning columns.</span> |
| |
| <span class="sd"> .. versionchanged:: 1.6</span> |
| <span class="sd"> Added optional arguments to specify the partitioning columns. Also made numPartitions</span> |
| <span class="sd"> optional if partitioning columns are specified.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.repartition(10).rdd.getNumPartitions()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> data = df.union(df).repartition("age")</span> |
| <span class="sd"> >>> data.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> >>> data = data.repartition(7, "age")</span> |
| <span class="sd"> >>> data.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> >>> data.rdd.getNumPartitions()</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> >>> data = data.repartition(3, "name", "age")</span> |
| <span class="sd"> >>> data.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">)),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="p">)</span> <span class="o">+</span> <span class="n">cols</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">)),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"numPartitions should be an int or Column"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.repartitionByRange"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.repartitionByRange.html#pyspark.sql.DataFrame.repartitionByRange">[docs]</a> <span class="k">def</span> <span class="nf">repartitionByRange</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The</span> |
| <span class="sd"> resulting :class:`DataFrame` is range partitioned.</span> |
| |
| <span class="sd"> At least one partition-by expression must be specified.</span> |
| <span class="sd"> When no explicit sort order is specified, "ascending nulls first" is assumed.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numPartitions : int</span> |
| <span class="sd"> can be an int to specify the target number of partitions or a Column.</span> |
| <span class="sd"> If it is a Column, it will be used as the first partitioning column. If not specified,</span> |
| <span class="sd"> the default number of partitions is used.</span> |
| <span class="sd"> cols : str or :class:`Column`</span> |
| <span class="sd"> partitioning columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Due to performance reasons this method uses sampling to estimate the ranges.</span> |
| <span class="sd"> Hence, the output may not be consistent, since sampling can return different values.</span> |
| <span class="sd"> The sample size can be controlled by the config</span> |
| <span class="sd"> `spark.sql.execution.rangeExchange.sampleSizePerPartition`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.repartitionByRange(2, "age").rdd.getNumPartitions()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> >>> df.repartitionByRange(1, "age").rdd.getNumPartitions()</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> >>> data = df.repartitionByRange("age")</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"At least one partition-by expression must be specified."</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">repartitionByRange</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">)),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="p">(</span><span class="n">numPartitions</span><span class="p">,)</span> <span class="o">+</span> <span class="n">cols</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">repartitionByRange</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">)),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"numPartitions should be an int, string or Column"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.distinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.distinct.html#pyspark.sql.DataFrame.distinct">[docs]</a> <span class="k">def</span> <span class="nf">distinct</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.distinct().count()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">distinct</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sample"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.sample.html#pyspark.sql.DataFrame.sample">[docs]</a> <span class="k">def</span> <span class="nf">sample</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">withReplacement</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">fraction</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Returns a sampled subset of this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> withReplacement : bool, optional</span> |
| <span class="sd"> Sample with replacement or not (default ``False``).</span> |
| <span class="sd"> fraction : float, optional</span> |
| <span class="sd"> Fraction of rows to generate, range [0.0, 1.0].</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> Seed for sampling (default a random seed).</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is not guaranteed to provide exactly the fraction specified of the total</span> |
| <span class="sd"> count of the given :class:`DataFrame`.</span> |
| |
| <span class="sd"> `fraction` is required and, `withReplacement` and `seed` are optional.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(10)</span> |
| <span class="sd"> >>> df.sample(0.5, 3).count()</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> >>> df.sample(fraction=0.5, seed=3).count()</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> >>> df.sample(withReplacement=True, fraction=0.5, seed=3).count()</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> >>> df.sample(1.0).count()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> df.sample(fraction=1.0).count()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> df.sample(False, fraction=1.0).count()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> """</span> |
| |
| <span class="c1"># For the cases below:</span> |
| <span class="c1"># sample(True, 0.5 [, seed])</span> |
| <span class="c1"># sample(True, fraction=0.5 [, seed])</span> |
| <span class="c1"># sample(withReplacement=False, fraction=0.5 [, seed])</span> |
| <span class="n">is_withReplacement_set</span> <span class="o">=</span> \ |
| <span class="nb">type</span><span class="p">(</span><span class="n">withReplacement</span><span class="p">)</span> <span class="o">==</span> <span class="nb">bool</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">fraction</span><span class="p">,</span> <span class="nb">float</span><span class="p">)</span> |
| |
| <span class="c1"># For the case below:</span> |
| <span class="c1"># sample(faction=0.5 [, seed])</span> |
| <span class="n">is_withReplacement_omitted_kwargs</span> <span class="o">=</span> \ |
| <span class="n">withReplacement</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">fraction</span><span class="p">,</span> <span class="nb">float</span><span class="p">)</span> |
| |
| <span class="c1"># For the case below:</span> |
| <span class="c1"># sample(0.5 [, seed])</span> |
| <span class="n">is_withReplacement_omitted_args</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">withReplacement</span><span class="p">,</span> <span class="nb">float</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="n">is_withReplacement_set</span> |
| <span class="ow">or</span> <span class="n">is_withReplacement_omitted_kwargs</span> |
| <span class="ow">or</span> <span class="n">is_withReplacement_omitted_args</span><span class="p">):</span> |
| <span class="n">argtypes</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="nb">str</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="p">[</span><span class="n">withReplacement</span><span class="p">,</span> <span class="n">fraction</span><span class="p">,</span> <span class="n">seed</span><span class="p">]</span> <span class="k">if</span> <span class="n">arg</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">]</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"withReplacement (optional), fraction (required) and seed (optional)"</span> |
| <span class="s2">" should be a bool, float and number; however, "</span> |
| <span class="s2">"got [</span><span class="si">%s</span><span class="s2">]."</span> <span class="o">%</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">argtypes</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">is_withReplacement_omitted_args</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">fraction</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">seed</span> <span class="o">=</span> <span class="n">fraction</span> |
| <span class="n">fraction</span> <span class="o">=</span> <span class="n">withReplacement</span> |
| <span class="n">withReplacement</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="n">seed</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">seed</span><span class="p">)</span> <span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">None</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="p">[</span><span class="n">withReplacement</span><span class="p">,</span> <span class="n">fraction</span><span class="p">,</span> <span class="n">seed</span><span class="p">]</span> <span class="k">if</span> <span class="n">arg</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">]</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sampleBy"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.sampleBy.html#pyspark.sql.DataFrame.sampleBy">[docs]</a> <span class="k">def</span> <span class="nf">sampleBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">fractions</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a stratified sample without replacement based on the</span> |
| <span class="sd"> fraction given on each stratum.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`Column` or str</span> |
| <span class="sd"> column that defines strata</span> |
| |
| <span class="sd"> .. versionchanged:: 3.0</span> |
| <span class="sd"> Added sampling by a column of :class:`Column`</span> |
| <span class="sd"> fractions : dict</span> |
| <span class="sd"> sampling fraction for each stratum. If a stratum is not</span> |
| <span class="sd"> specified, we treat its fraction as zero.</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> random seed</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> a new :class:`DataFrame` that represents the stratified sample</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import col</span> |
| <span class="sd"> >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))</span> |
| <span class="sd"> >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)</span> |
| <span class="sd"> >>> sampled.groupBy("key").count().orderBy("key").show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |key|count|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 0| 3|</span> |
| <span class="sd"> | 1| 6|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count()</span> |
| <span class="sd"> 33</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">Column</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col must be a string or a column, but got </span><span class="si">%r</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">fractions</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"fractions must be a dict but got </span><span class="si">%r</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">fractions</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">fractions</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">k</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"key must be float, int, or string, but got </span><span class="si">%r</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">k</span><span class="p">))</span> |
| <span class="n">fractions</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span><span class="o">.</span><span class="n">_jc</span> |
| <span class="n">seed</span> <span class="o">=</span> <span class="n">seed</span> <span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">stat</span><span class="p">()</span><span class="o">.</span><span class="n">sampleBy</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jmap</span><span class="p">(</span><span class="n">fractions</span><span class="p">),</span> <span class="n">seed</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.randomSplit"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.randomSplit.html#pyspark.sql.DataFrame.randomSplit">[docs]</a> <span class="k">def</span> <span class="nf">randomSplit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">weights</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Randomly splits this :class:`DataFrame` with the provided weights.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> weights : list</span> |
| <span class="sd"> list of doubles as weights with which to split the :class:`DataFrame`.</span> |
| <span class="sd"> Weights will be normalized if they don't sum up to 1.0.</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> The seed for sampling.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> splits = df4.randomSplit([1.0, 2.0], 24)</span> |
| <span class="sd"> >>> splits[0].count()</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> >>> splits[1].count()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">weights</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">w</span> <span class="o"><</span> <span class="mf">0.0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Weights must be positive. Found weight value: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">w</span><span class="p">)</span> |
| <span class="n">seed</span> <span class="o">=</span> <span class="n">seed</span> <span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">)</span> |
| <span class="n">rdd_array</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">randomSplit</span><span class="p">(</span><span class="n">_to_list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">weights</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">seed</span><span class="p">))</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> <span class="k">for</span> <span class="n">rdd</span> <span class="ow">in</span> <span class="n">rdd_array</span><span class="p">]</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns all column names and their data types as a list.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.dtypes</span> |
| <span class="sd"> [('age', 'int'), ('name', 'string')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">[(</span><span class="nb">str</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">name</span><span class="p">),</span> <span class="n">f</span><span class="o">.</span><span class="n">dataType</span><span class="o">.</span><span class="n">simpleString</span><span class="p">())</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">fields</span><span class="p">]</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">columns</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns all column names as a list.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.columns</span> |
| <span class="sd"> ['age', 'name']</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">f</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">fields</span><span class="p">]</span> |
| |
| <div class="viewcode-block" id="DataFrame.colRegex"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.colRegex.html#pyspark.sql.DataFrame.colRegex">[docs]</a> <span class="k">def</span> <span class="nf">colRegex</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colName</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Selects column based on the column name specified as a regex and returns it</span> |
| <span class="sd"> as :class:`Column`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> colName : str</span> |
| <span class="sd"> string, column name specified as a regex.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"])</span> |
| <span class="sd"> >>> df.select(df.colRegex("`(Col1)?+.+`")).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |Col2|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">colName</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"colName should be provided as string"</span><span class="p">)</span> |
| <span class="n">jc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">colRegex</span><span class="p">(</span><span class="n">colName</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.alias"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.alias.html#pyspark.sql.DataFrame.alias">[docs]</a> <span class="k">def</span> <span class="nf">alias</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">alias</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` with an alias set.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> alias : str</span> |
| <span class="sd"> an alias name to be set for the :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import *</span> |
| <span class="sd"> >>> df_as1 = df.alias("df_as1")</span> |
| <span class="sd"> >>> df_as2 = df.alias("df_as2")</span> |
| <span class="sd"> >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')</span> |
| <span class="sd"> >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age") \</span> |
| <span class="sd"> .sort(desc("df_as1.name")).collect()</span> |
| <span class="sd"> [Row(name='Bob', name='Bob', age=5), Row(name='Alice', name='Alice', age=2)]</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">alias</span><span class="p">,</span> <span class="nb">str</span><span class="p">),</span> <span class="s2">"alias should be a string"</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="p">,</span> <span class="s2">"as"</span><span class="p">)(</span><span class="n">alias</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.crossJoin"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.crossJoin.html#pyspark.sql.DataFrame.crossJoin">[docs]</a> <span class="k">def</span> <span class="nf">crossJoin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">"""Returns the cartesian product with another :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Right side of the cartesian product.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.select("age", "name").collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice'), Row(age=5, name='Bob')]</span> |
| <span class="sd"> >>> df2.select("name", "height").collect()</span> |
| <span class="sd"> [Row(name='Tom', height=80), Row(name='Bob', height=85)]</span> |
| <span class="sd"> >>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice', height=80), Row(age=2, name='Alice', height=85),</span> |
| <span class="sd"> Row(age=5, name='Bob', height=80), Row(age=5, name='Bob', height=85)]</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">crossJoin</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.join"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.join.html#pyspark.sql.DataFrame.join">[docs]</a> <span class="k">def</span> <span class="nf">join</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Joins with another :class:`DataFrame`, using the given join expression.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Right side of the join</span> |
| <span class="sd"> on : str, list or :class:`Column`, optional</span> |
| <span class="sd"> a string for the join column name, a list of column names,</span> |
| <span class="sd"> a join expression (Column), or a list of Columns.</span> |
| <span class="sd"> If `on` is a string or a list of strings indicating the name of the join column(s),</span> |
| <span class="sd"> the column(s) must exist on both sides, and this performs an equi-join.</span> |
| <span class="sd"> how : str, optional</span> |
| <span class="sd"> default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,</span> |
| <span class="sd"> ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,</span> |
| <span class="sd"> ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,</span> |
| <span class="sd"> ``anti``, ``leftanti`` and ``left_anti``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> The following performs a full outer join between ``df1`` and ``df2``.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.functions import desc</span> |
| <span class="sd"> >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height) \</span> |
| <span class="sd"> .sort(desc("name")).collect()</span> |
| <span class="sd"> [Row(name='Bob', height=85), Row(name='Alice', height=None), Row(name=None, height=80)]</span> |
| |
| <span class="sd"> >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).collect()</span> |
| <span class="sd"> [Row(name='Tom', height=80), Row(name='Bob', height=85), Row(name='Alice', height=None)]</span> |
| |
| <span class="sd"> >>> cond = [df.name == df3.name, df.age == df3.age]</span> |
| <span class="sd"> >>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect()</span> |
| <span class="sd"> [Row(name='Alice', age=2), Row(name='Bob', age=5)]</span> |
| |
| <span class="sd"> >>> df.join(df2, 'name').select(df.name, df2.height).collect()</span> |
| <span class="sd"> [Row(name='Bob', height=85)]</span> |
| |
| <span class="sd"> >>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect()</span> |
| <span class="sd"> [Row(name='Bob', age=5)]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">if</span> <span class="n">on</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">on</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="p">[</span><span class="n">on</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">on</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">on</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">on</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">on</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">Column</span><span class="p">),</span> <span class="s2">"on should be Column or list of Column"</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="fm">__and__</span><span class="p">(</span><span class="n">y</span><span class="p">),</span> <span class="n">on</span><span class="p">)</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="n">on</span><span class="o">.</span><span class="n">_jc</span> |
| |
| <span class="k">if</span> <span class="n">on</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">how</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">how</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">how</span> <span class="o">=</span> <span class="s2">"inner"</span> |
| <span class="k">if</span> <span class="n">on</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">([])</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">how</span><span class="p">,</span> <span class="nb">str</span><span class="p">),</span> <span class="s2">"how should be a string"</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">,</span> <span class="n">on</span><span class="p">,</span> <span class="n">how</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sortWithinPartitions"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.sortWithinPartitions.html#pyspark.sql.DataFrame.sortWithinPartitions">[docs]</a> <span class="k">def</span> <span class="nf">sortWithinPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : str, list or :class:`Column`, optional</span> |
| <span class="sd"> list of :class:`Column` or column names to sort by.</span> |
| |
| <span class="sd"> Other Parameters</span> |
| <span class="sd"> ----------------</span> |
| <span class="sd"> ascending : bool or list, optional</span> |
| <span class="sd"> boolean or list of boolean (default ``True``).</span> |
| <span class="sd"> Sort ascending vs. descending. Specify list for multiple sort orders.</span> |
| <span class="sd"> If a list is specified, length of the list must equal length of the `cols`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.sortWithinPartitions("age", ascending=False).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">sortWithinPartitions</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sort_cols</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="n">kwargs</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sort"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.sort.html#pyspark.sql.DataFrame.sort">[docs]</a> <span class="k">def</span> <span class="nf">sort</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` sorted by the specified column(s).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : str, list, or :class:`Column`, optional</span> |
| <span class="sd"> list of :class:`Column` or column names to sort by.</span> |
| |
| <span class="sd"> Other Parameters</span> |
| <span class="sd"> ----------------</span> |
| <span class="sd"> ascending : bool or list, optional</span> |
| <span class="sd"> boolean or list of boolean (default ``True``).</span> |
| <span class="sd"> Sort ascending vs. descending. Specify list for multiple sort orders.</span> |
| <span class="sd"> If a list is specified, length of the list must equal length of the `cols`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.sort(df.age.desc()).collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob'), Row(age=2, name='Alice')]</span> |
| <span class="sd"> >>> df.sort("age", ascending=False).collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob'), Row(age=2, name='Alice')]</span> |
| <span class="sd"> >>> df.orderBy(df.age.desc()).collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob'), Row(age=2, name='Alice')]</span> |
| <span class="sd"> >>> from pyspark.sql.functions import *</span> |
| <span class="sd"> >>> df.sort(asc("age")).collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice'), Row(age=5, name='Bob')]</span> |
| <span class="sd"> >>> df.orderBy(desc("age"), "name").collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob'), Row(age=2, name='Alice')]</span> |
| <span class="sd"> >>> df.orderBy(["age", "name"], ascending=[0, 1]).collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob'), Row(age=2, name='Alice')]</span> |
| <span class="sd"> """</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sort_cols</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="n">kwargs</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <span class="n">orderBy</span> <span class="o">=</span> <span class="n">sort</span> |
| |
| <span class="k">def</span> <span class="nf">_jseq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">converter</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Return a JVM Seq of Columns from a list of Column or names"""</span> |
| <span class="k">return</span> <span class="n">_to_seq</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">converter</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_jmap</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">jm</span><span class="p">):</span> |
| <span class="sd">"""Return a JVM Scala Map from a dict"""</span> |
| <span class="k">return</span> <span class="n">_to_scala_map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">jm</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_jcols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""Return a JVM Seq of Columns from a list of Column or column names</span> |
| |
| <span class="sd"> If `cols` has only one list in it, cols[0] will be used as the list.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_sort_cols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">kwargs</span><span class="p">):</span> |
| <span class="sd">""" Return a JVM Seq of Columns that describes the sort order</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">cols</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"should sort by at least one column"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">jcols</span> <span class="o">=</span> <span class="p">[</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">]</span> |
| <span class="n">ascending</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'ascending'</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ascending</span><span class="p">,</span> <span class="p">(</span><span class="nb">bool</span><span class="p">,</span> <span class="nb">int</span><span class="p">)):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">ascending</span><span class="p">:</span> |
| <span class="n">jcols</span> <span class="o">=</span> <span class="p">[</span><span class="n">jc</span><span class="o">.</span><span class="n">desc</span><span class="p">()</span> <span class="k">for</span> <span class="n">jc</span> <span class="ow">in</span> <span class="n">jcols</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ascending</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">jcols</span> <span class="o">=</span> <span class="p">[</span><span class="n">jc</span> <span class="k">if</span> <span class="n">asc</span> <span class="k">else</span> <span class="n">jc</span><span class="o">.</span><span class="n">desc</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">asc</span><span class="p">,</span> <span class="n">jc</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">ascending</span><span class="p">,</span> <span class="n">jcols</span><span class="p">)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"ascending can only be boolean or list, but got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">ascending</span><span class="p">))</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">jcols</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.describe"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.describe.html#pyspark.sql.DataFrame.describe">[docs]</a> <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""Computes basic statistics for numeric and string columns.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.1</span> |
| |
| <span class="sd"> This include count, mean, stddev, min, and max. If no columns are</span> |
| <span class="sd"> given, this function computes statistics for all numerical or string columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function is meant for exploratory data analysis, as we make no</span> |
| <span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> Use summary for expanded statistics and control over which statistics to compute.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.describe(['age']).show()</span> |
| <span class="sd"> +-------+------------------+</span> |
| <span class="sd"> |summary| age|</span> |
| <span class="sd"> +-------+------------------+</span> |
| <span class="sd"> | count| 2|</span> |
| <span class="sd"> | mean| 3.5|</span> |
| <span class="sd"> | stddev|2.1213203435596424|</span> |
| <span class="sd"> | min| 2|</span> |
| <span class="sd"> | max| 5|</span> |
| <span class="sd"> +-------+------------------+</span> |
| <span class="sd"> >>> df.describe().show()</span> |
| <span class="sd"> +-------+------------------+-----+</span> |
| <span class="sd"> |summary| age| name|</span> |
| <span class="sd"> +-------+------------------+-----+</span> |
| <span class="sd"> | count| 2| 2|</span> |
| <span class="sd"> | mean| 3.5| null|</span> |
| <span class="sd"> | stddev|2.1213203435596424| null|</span> |
| <span class="sd"> | min| 2|Alice|</span> |
| <span class="sd"> | max| 5| Bob|</span> |
| <span class="sd"> +-------+------------------+-----+</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.summary</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">describe</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">cols</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.summary"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.summary.html#pyspark.sql.DataFrame.summary">[docs]</a> <span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">statistics</span><span class="p">):</span> |
| <span class="sd">"""Computes specified statistics for numeric and string columns. Available statistics are:</span> |
| <span class="sd"> - count</span> |
| <span class="sd"> - mean</span> |
| <span class="sd"> - stddev</span> |
| <span class="sd"> - min</span> |
| <span class="sd"> - max</span> |
| <span class="sd"> - arbitrary approximate percentiles specified as a percentage (e.g., 75%)</span> |
| |
| <span class="sd"> If no statistics are given, this function computes count, mean, stddev, min,</span> |
| <span class="sd"> approximate quartiles (percentiles at 25%, 50%, and 75%), and max.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function is meant for exploratory data analysis, as we make no</span> |
| <span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.summary().show()</span> |
| <span class="sd"> +-------+------------------+-----+</span> |
| <span class="sd"> |summary| age| name|</span> |
| <span class="sd"> +-------+------------------+-----+</span> |
| <span class="sd"> | count| 2| 2|</span> |
| <span class="sd"> | mean| 3.5| null|</span> |
| <span class="sd"> | stddev|2.1213203435596424| null|</span> |
| <span class="sd"> | min| 2|Alice|</span> |
| <span class="sd"> | 25%| 2| null|</span> |
| <span class="sd"> | 50%| 2| null|</span> |
| <span class="sd"> | 75%| 5| null|</span> |
| <span class="sd"> | max| 5| Bob|</span> |
| <span class="sd"> +-------+------------------+-----+</span> |
| |
| <span class="sd"> >>> df.summary("count", "min", "25%", "75%", "max").show()</span> |
| <span class="sd"> +-------+---+-----+</span> |
| <span class="sd"> |summary|age| name|</span> |
| <span class="sd"> +-------+---+-----+</span> |
| <span class="sd"> | count| 2| 2|</span> |
| <span class="sd"> | min| 2|Alice|</span> |
| <span class="sd"> | 25%| 2| null|</span> |
| <span class="sd"> | 75%| 5| null|</span> |
| <span class="sd"> | max| 5| Bob|</span> |
| <span class="sd"> +-------+---+-----+</span> |
| |
| <span class="sd"> To do a summary for specific columns first select them:</span> |
| |
| <span class="sd"> >>> df.select("age", "name").summary("count").show()</span> |
| <span class="sd"> +-------+---+----+</span> |
| <span class="sd"> |summary|age|name|</span> |
| <span class="sd"> +-------+---+----+</span> |
| <span class="sd"> | count| 2| 2|</span> |
| <span class="sd"> +-------+---+----+</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.display</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">statistics</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">statistics</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">statistics</span> <span class="o">=</span> <span class="n">statistics</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">statistics</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.head"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.head.html#pyspark.sql.DataFrame.head">[docs]</a> <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Returns the first ``n`` rows.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method should only be used if the resulting array is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, optional</span> |
| <span class="sd"> default 1. Number of rows to return.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> If n is greater than 1, return a list of :class:`Row`.</span> |
| <span class="sd"> If n is 1, return a single Row.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.head()</span> |
| <span class="sd"> Row(age=2, name='Alice')</span> |
| <span class="sd"> >>> df.head(1)</span> |
| <span class="sd"> [Row(age=2, name='Alice')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">rs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">rs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">rs</span> <span class="k">else</span> <span class="kc">None</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="n">n</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.first"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.first.html#pyspark.sql.DataFrame.first">[docs]</a> <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Returns the first row as a :class:`Row`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.first()</span> |
| <span class="sd"> Row(age=2, name='Alice')</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">()</span></div> |
| |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span> |
| <span class="sd">"""Returns the column as a :class:`Column`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.select(df['age']).collect()</span> |
| <span class="sd"> [Row(age=2), Row(age=5)]</span> |
| <span class="sd"> >>> df[ ["name", "age"]].collect()</span> |
| <span class="sd"> [Row(name='Alice', age=2), Row(name='Bob', age=5)]</span> |
| <span class="sd"> >>> df[ df.age > 3 ].collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob')]</span> |
| <span class="sd"> >>> df[df[0] > 3].collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">jc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">item</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="n">jc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="n">item</span><span class="p">])</span> |
| <span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"unexpected item type: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">item</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="sd">"""Returns the :class:`Column` denoted by ``name``.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.select(df.age).collect()</span> |
| <span class="sd"> [Row(age=2), Row(age=5)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">name</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span> |
| <span class="s2">"'</span><span class="si">%s</span><span class="s2">' object has no attribute '</span><span class="si">%s</span><span class="s2">'"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span> |
| <span class="n">jc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.select"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.select.html#pyspark.sql.DataFrame.select">[docs]</a> <span class="k">def</span> <span class="nf">select</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""Projects a set of expressions and returns a new :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : str, :class:`Column`, or list</span> |
| <span class="sd"> column names (string) or expressions (:class:`Column`).</span> |
| <span class="sd"> If one of the column names is '*', that column is expanded to include all columns</span> |
| <span class="sd"> in the current :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.select('*').collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice'), Row(age=5, name='Bob')]</span> |
| <span class="sd"> >>> df.select('name', 'age').collect()</span> |
| <span class="sd"> [Row(name='Alice', age=2), Row(name='Bob', age=5)]</span> |
| <span class="sd"> >>> df.select(df.name, (df.age + 10).alias('age')).collect()</span> |
| <span class="sd"> [Row(name='Alice', age=12), Row(name='Bob', age=15)]</span> |
| <span class="sd"> """</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.selectExpr"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.selectExpr.html#pyspark.sql.DataFrame.selectExpr">[docs]</a> <span class="k">def</span> <span class="nf">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">expr</span><span class="p">):</span> |
| <span class="sd">"""Projects a set of SQL expressions and returns a new :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is a variant of :func:`select` that accepts SQL expressions.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.selectExpr("age * 2", "abs(age)").collect()</span> |
| <span class="sd"> [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">expr</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">expr</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">expr</span> <span class="o">=</span> <span class="n">expr</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">expr</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.filter"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.filter.html#pyspark.sql.DataFrame.filter">[docs]</a> <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">condition</span><span class="p">):</span> |
| <span class="sd">"""Filters rows using the given condition.</span> |
| |
| <span class="sd"> :func:`where` is an alias for :func:`filter`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> condition : :class:`Column` or str</span> |
| <span class="sd"> a :class:`Column` of :class:`types.BooleanType`</span> |
| <span class="sd"> or a string of SQL expression.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.filter(df.age > 3).collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob')]</span> |
| <span class="sd"> >>> df.where(df.age == 2).collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice')]</span> |
| |
| <span class="sd"> >>> df.filter("age > 3").collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob')]</span> |
| <span class="sd"> >>> df.where("age = 2").collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">condition</span><span class="o">.</span><span class="n">_jc</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"condition should be string or Column"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.groupBy"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.groupBy.html#pyspark.sql.DataFrame.groupBy">[docs]</a> <span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""Groups the :class:`DataFrame` using the specified columns,</span> |
| <span class="sd"> so we can run aggregation on them. See :class:`GroupedData`</span> |
| <span class="sd"> for all the available aggregate functions.</span> |
| |
| <span class="sd"> :func:`groupby` is an alias for :func:`groupBy`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : list, str or :class:`Column`</span> |
| <span class="sd"> columns to group by.</span> |
| <span class="sd"> Each element should be a column name (string) or an expression (:class:`Column`).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.groupBy().avg().collect()</span> |
| <span class="sd"> [Row(avg(age)=3.5)]</span> |
| <span class="sd"> >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())</span> |
| <span class="sd"> [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]</span> |
| <span class="sd"> >>> sorted(df.groupBy(df.name).avg().collect())</span> |
| <span class="sd"> [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]</span> |
| <span class="sd"> >>> sorted(df.groupBy(['name', df.age]).count().collect())</span> |
| <span class="sd"> [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)]</span> |
| <span class="sd"> """</span> |
| <span class="n">jgd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">))</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.group</span> <span class="kn">import</span> <span class="n">GroupedData</span> |
| <span class="k">return</span> <span class="n">GroupedData</span><span class="p">(</span><span class="n">jgd</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.rollup"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.rollup.html#pyspark.sql.DataFrame.rollup">[docs]</a> <span class="k">def</span> <span class="nf">rollup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Create a multi-dimensional rollup for the current :class:`DataFrame` using</span> |
| <span class="sd"> the specified columns, so we can run aggregation on them.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.rollup("name", df.age).count().orderBy("name", "age").show()</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | name| age|count|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | null|null| 2|</span> |
| <span class="sd"> |Alice|null| 1|</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob|null| 1|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="n">jgd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">rollup</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">))</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.group</span> <span class="kn">import</span> <span class="n">GroupedData</span> |
| <span class="k">return</span> <span class="n">GroupedData</span><span class="p">(</span><span class="n">jgd</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.cube"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.cube.html#pyspark.sql.DataFrame.cube">[docs]</a> <span class="k">def</span> <span class="nf">cube</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Create a multi-dimensional cube for the current :class:`DataFrame` using</span> |
| <span class="sd"> the specified columns, so we can run aggregations on them.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.cube("name", df.age).count().orderBy("name", "age").show()</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | name| age|count|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | null|null| 2|</span> |
| <span class="sd"> | null| 2| 1|</span> |
| <span class="sd"> | null| 5| 1|</span> |
| <span class="sd"> |Alice|null| 1|</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob|null| 1|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="n">jgd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">cube</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jcols</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">))</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.group</span> <span class="kn">import</span> <span class="n">GroupedData</span> |
| <span class="k">return</span> <span class="n">GroupedData</span><span class="p">(</span><span class="n">jgd</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.agg"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.agg.html#pyspark.sql.DataFrame.agg">[docs]</a> <span class="k">def</span> <span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">exprs</span><span class="p">):</span> |
| <span class="sd">""" Aggregate on the entire :class:`DataFrame` without groups</span> |
| <span class="sd"> (shorthand for ``df.groupBy().agg()``).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.agg({"age": "max"}).collect()</span> |
| <span class="sd"> [Row(max(age)=5)]</span> |
| <span class="sd"> >>> from pyspark.sql import functions as F</span> |
| <span class="sd"> >>> df.agg(F.min(df.age)).collect()</span> |
| <span class="sd"> [Row(min(age)=2)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">groupBy</span><span class="p">()</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">exprs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.union"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.union.html#pyspark.sql.DataFrame.union">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="mf">2.0</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">union</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">""" Return a new :class:`DataFrame` containing union of rows in this and another</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union</span> |
| <span class="sd"> (that does deduplication of elements), use this function followed by :func:`distinct`.</span> |
| |
| <span class="sd"> Also as standard in SQL, this function resolves columns by position (not by name).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.unionAll"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.unionAll.html#pyspark.sql.DataFrame.unionAll">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">unionAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">""" Return a new :class:`DataFrame` containing union of rows in this and another</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union</span> |
| <span class="sd"> (that does deduplication of elements), use this function followed by :func:`distinct`.</span> |
| |
| <span class="sd"> Also as standard in SQL, this function resolves columns by position (not by name).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">other</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.unionByName"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.unionByName.html#pyspark.sql.DataFrame.unionByName">[docs]</a> <span class="k">def</span> <span class="nf">unionByName</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">,</span> <span class="n">allowMissingColumns</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> |
| <span class="sd">""" Returns a new :class:`DataFrame` containing union of rows in this and another</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set</span> |
| <span class="sd"> union (that does deduplication of elements), use this function followed by :func:`distinct`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> The difference between this function and :func:`union` is that this function</span> |
| <span class="sd"> resolves columns by name (not by position):</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])</span> |
| <span class="sd"> >>> df1.unionByName(df2).show()</span> |
| <span class="sd"> +----+----+----+</span> |
| <span class="sd"> |col0|col1|col2|</span> |
| <span class="sd"> +----+----+----+</span> |
| <span class="sd"> | 1| 2| 3|</span> |
| <span class="sd"> | 6| 4| 5|</span> |
| <span class="sd"> +----+----+----+</span> |
| |
| <span class="sd"> When the parameter `allowMissingColumns` is ``True``, the set of column names</span> |
| <span class="sd"> in this and other :class:`DataFrame` can differ; missing columns will be filled with null.</span> |
| <span class="sd"> Further, the missing columns of this :class:`DataFrame` will be added at the end</span> |
| <span class="sd"> in the schema of the union result:</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])</span> |
| <span class="sd"> >>> df1.unionByName(df2, allowMissingColumns=True).show()</span> |
| <span class="sd"> +----+----+----+----+</span> |
| <span class="sd"> |col0|col1|col2|col3|</span> |
| <span class="sd"> +----+----+----+----+</span> |
| <span class="sd"> | 1| 2| 3|null|</span> |
| <span class="sd"> |null| 4| 5| 6|</span> |
| <span class="sd"> +----+----+----+----+</span> |
| |
| <span class="sd"> .. versionchanged:: 3.1.0</span> |
| <span class="sd"> Added optional argument `allowMissingColumns` to specify whether to allow</span> |
| <span class="sd"> missing columns.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">unionByName</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">,</span> <span class="n">allowMissingColumns</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.intersect"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.intersect.html#pyspark.sql.DataFrame.intersect">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">intersect</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">""" Return a new :class:`DataFrame` containing rows only in</span> |
| <span class="sd"> both this :class:`DataFrame` and another :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is equivalent to `INTERSECT` in SQL.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">intersect</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.intersectAll"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.intersectAll.html#pyspark.sql.DataFrame.intersectAll">[docs]</a> <span class="k">def</span> <span class="nf">intersectAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">""" Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`</span> |
| <span class="sd"> and another :class:`DataFrame` while preserving duplicates.</span> |
| |
| <span class="sd"> This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function</span> |
| <span class="sd"> resolves columns by position (not by name).</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])</span> |
| |
| <span class="sd"> >>> df1.intersectAll(df2).sort("C1", "C2").show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | C1| C2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | b| 3|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">intersectAll</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.subtract"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.subtract.html#pyspark.sql.DataFrame.subtract">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">subtract</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">""" Return a new :class:`DataFrame` containing rows in this :class:`DataFrame`</span> |
| <span class="sd"> but not in another :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is equivalent to `EXCEPT DISTINCT` in SQL.</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="p">,</span> <span class="s2">"except"</span><span class="p">)(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.dropDuplicates"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.dropDuplicates.html#pyspark.sql.DataFrame.dropDuplicates">[docs]</a> <span class="k">def</span> <span class="nf">dropDuplicates</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Return a new :class:`DataFrame` with duplicate rows removed,</span> |
| <span class="sd"> optionally only considering certain columns.</span> |
| |
| <span class="sd"> For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming</span> |
| <span class="sd"> :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop</span> |
| <span class="sd"> duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can</span> |
| <span class="sd"> be and system will accordingly limit the state. In addition, too late data older than</span> |
| <span class="sd"> watermark will be dropped to avoid any possibility of duplicates.</span> |
| |
| <span class="sd"> :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = sc.parallelize([ \\</span> |
| <span class="sd"> ... Row(name='Alice', age=5, height=80), \\</span> |
| <span class="sd"> ... Row(name='Alice', age=5, height=80), \\</span> |
| <span class="sd"> ... Row(name='Alice', age=10, height=80)]).toDF()</span> |
| <span class="sd"> >>> df.dropDuplicates().show()</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> | name|age|height|</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> |Alice| 5| 80|</span> |
| <span class="sd"> |Alice| 10| 80|</span> |
| <span class="sd"> +-----+---+------+</span> |
| |
| <span class="sd"> >>> df.dropDuplicates(['name', 'height']).show()</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> | name|age|height|</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> |Alice| 5| 80|</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">subset</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="p">(</span> |
| <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="nb">str</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Parameter 'subset' must be a list of columns"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">subset</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">dropDuplicates</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">dropDuplicates</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">subset</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.dropna"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.dropna.html#pyspark.sql.DataFrame.dropna">[docs]</a> <span class="k">def</span> <span class="nf">dropna</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">'any'</span><span class="p">,</span> <span class="n">thresh</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` omitting rows with null values.</span> |
| <span class="sd"> :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.1</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> how : str, optional</span> |
| <span class="sd"> 'any' or 'all'.</span> |
| <span class="sd"> If 'any', drop a row if it contains any nulls.</span> |
| <span class="sd"> If 'all', drop a row only if all its values are null.</span> |
| <span class="sd"> thresh: int, optional</span> |
| <span class="sd"> default None</span> |
| <span class="sd"> If specified, drop rows that have less than `thresh` non-null values.</span> |
| <span class="sd"> This overwrites the `how` parameter.</span> |
| <span class="sd"> subset : str, tuple or list, optional</span> |
| <span class="sd"> optional list of column names to consider.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df4.na.drop().show()</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> |age|height| name|</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> | 10| 80|Alice|</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">how</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">how</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">'any'</span><span class="p">,</span> <span class="s1">'all'</span><span class="p">]:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"how ('"</span> <span class="o">+</span> <span class="n">how</span> <span class="o">+</span> <span class="s2">"') should be 'any' or 'all'"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">subset</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">subset</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">subset</span> <span class="o">=</span> <span class="p">[</span><span class="n">subset</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"subset should be a list or tuple of column names"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">thresh</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">thresh</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">subset</span><span class="p">)</span> <span class="k">if</span> <span class="n">how</span> <span class="o">==</span> <span class="s1">'any'</span> <span class="k">else</span> <span class="mi">1</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">na</span><span class="p">()</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">thresh</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">subset</span><span class="p">)),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.fillna"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.fillna.html#pyspark.sql.DataFrame.fillna">[docs]</a> <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Replace null values, alias for ``na.fill()``.</span> |
| <span class="sd"> :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.1</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> value : int, float, string, bool or dict</span> |
| <span class="sd"> Value to replace null values with.</span> |
| <span class="sd"> If the value is a dict, then `subset` is ignored and `value` must be a mapping</span> |
| <span class="sd"> from column name (string) to replacement value. The replacement value must be</span> |
| <span class="sd"> an int, float, boolean, or string.</span> |
| <span class="sd"> subset : str, tuple or list, optional</span> |
| <span class="sd"> optional list of column names to consider.</span> |
| <span class="sd"> Columns specified in subset that do not have matching data type are ignored.</span> |
| <span class="sd"> For example, if `value` is a string, and subset contains a non-string column,</span> |
| <span class="sd"> then the non-string column is simply ignored.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df4.na.fill(50).show()</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> |age|height| name|</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> | 10| 80|Alice|</span> |
| <span class="sd"> | 5| 50| Bob|</span> |
| <span class="sd"> | 50| 50| Tom|</span> |
| <span class="sd"> | 50| 50| null|</span> |
| <span class="sd"> +---+------+-----+</span> |
| |
| <span class="sd"> >>> df5.na.fill(False).show()</span> |
| <span class="sd"> +----+-------+-----+</span> |
| <span class="sd"> | age| name| spy|</span> |
| <span class="sd"> +----+-------+-----+</span> |
| <span class="sd"> | 10| Alice|false|</span> |
| <span class="sd"> | 5| Bob|false|</span> |
| <span class="sd"> |null|Mallory| true|</span> |
| <span class="sd"> +----+-------+-----+</span> |
| |
| <span class="sd"> >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()</span> |
| <span class="sd"> +---+------+-------+</span> |
| <span class="sd"> |age|height| name|</span> |
| <span class="sd"> +---+------+-------+</span> |
| <span class="sd"> | 10| 80| Alice|</span> |
| <span class="sd"> | 5| null| Bob|</span> |
| <span class="sd"> | 50| null| Tom|</span> |
| <span class="sd"> | 50| null|unknown|</span> |
| <span class="sd"> +---+------+-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">bool</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"value should be a float, int, string, bool or dict"</span><span class="p">)</span> |
| |
| <span class="c1"># Note that bool validates isinstance(int), but we don't want to</span> |
| <span class="c1"># convert bools to floats</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">na</span><span class="p">()</span><span class="o">.</span><span class="n">fill</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">subset</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">na</span><span class="p">()</span><span class="o">.</span><span class="n">fill</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">subset</span> <span class="o">=</span> <span class="p">[</span><span class="n">subset</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"subset should be a list or tuple of column names"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">na</span><span class="p">()</span><span class="o">.</span><span class="n">fill</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">subset</span><span class="p">)),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.replace"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.replace.html#pyspark.sql.DataFrame.replace">[docs]</a> <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">to_replace</span><span class="p">,</span> <span class="n">value</span><span class="o">=</span><span class="n">_NoValue</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` replacing a value with another value.</span> |
| <span class="sd"> :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are</span> |
| <span class="sd"> aliases of each other.</span> |
| <span class="sd"> Values to_replace and value must have the same type and can only be numerics, booleans,</span> |
| <span class="sd"> or strings. Value can have None. When replacing, the new value will be cast</span> |
| <span class="sd"> to the type of the existing column.</span> |
| <span class="sd"> For numeric replacements all values to be replaced should have unique</span> |
| <span class="sd"> floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)</span> |
| <span class="sd"> and arbitrary replacement will be used.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> to_replace : bool, int, float, string, list or dict</span> |
| <span class="sd"> Value to be replaced.</span> |
| <span class="sd"> If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`</span> |
| <span class="sd"> must be a mapping between a value and a replacement.</span> |
| <span class="sd"> value : bool, int, float, string or None, optional</span> |
| <span class="sd"> The replacement value must be a bool, int, float, string or None. If `value` is a</span> |
| <span class="sd"> list, `value` should be of the same length and type as `to_replace`.</span> |
| <span class="sd"> If `value` is a scalar and `to_replace` is a sequence, then `value` is</span> |
| <span class="sd"> used as a replacement for each item in `to_replace`.</span> |
| <span class="sd"> subset : list, optional</span> |
| <span class="sd"> optional list of column names to consider.</span> |
| <span class="sd"> Columns specified in subset that do not have matching data type are ignored.</span> |
| <span class="sd"> For example, if `value` is a string, and subset contains a non-string column,</span> |
| <span class="sd"> then the non-string column is simply ignored.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df4.na.replace(10, 20).show()</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | age|height| name|</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | 20| 80|Alice|</span> |
| <span class="sd"> | 5| null| Bob|</span> |
| <span class="sd"> |null| null| Tom|</span> |
| <span class="sd"> |null| null| null|</span> |
| <span class="sd"> +----+------+-----+</span> |
| |
| <span class="sd"> >>> df4.na.replace('Alice', None).show()</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | age|height|name|</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | 10| 80|null|</span> |
| <span class="sd"> | 5| null| Bob|</span> |
| <span class="sd"> |null| null| Tom|</span> |
| <span class="sd"> |null| null|null|</span> |
| <span class="sd"> +----+------+----+</span> |
| |
| <span class="sd"> >>> df4.na.replace({'Alice': None}).show()</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | age|height|name|</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | 10| 80|null|</span> |
| <span class="sd"> | 5| null| Bob|</span> |
| <span class="sd"> |null| null| Tom|</span> |
| <span class="sd"> |null| null|null|</span> |
| <span class="sd"> +----+------+----+</span> |
| |
| <span class="sd"> >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | age|height|name|</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | 10| 80| A|</span> |
| <span class="sd"> | 5| null| B|</span> |
| <span class="sd"> |null| null| Tom|</span> |
| <span class="sd"> |null| null|null|</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="n">_NoValue</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"value argument is required when to_replace is not a dictionary."</span><span class="p">)</span> |
| |
| <span class="c1"># Helper functions</span> |
| <span class="k">def</span> <span class="nf">all_of</span><span class="p">(</span><span class="n">types</span><span class="p">):</span> |
| <span class="sd">"""Given a type or tuple of types and a sequence of xs</span> |
| <span class="sd"> check if each x is instance of type(s)</span> |
| |
| <span class="sd"> >>> all_of(bool)([True, False])</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> all_of(str)(["a", 1])</span> |
| <span class="sd"> False</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="nf">all_of_</span><span class="p">(</span><span class="n">xs</span><span class="p">):</span> |
| <span class="k">return</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">types</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">xs</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">all_of_</span> |
| |
| <span class="n">all_of_bool</span> <span class="o">=</span> <span class="n">all_of</span><span class="p">(</span><span class="nb">bool</span><span class="p">)</span> |
| <span class="n">all_of_str</span> <span class="o">=</span> <span class="n">all_of</span><span class="p">(</span><span class="nb">str</span><span class="p">)</span> |
| <span class="n">all_of_numeric</span> <span class="o">=</span> <span class="n">all_of</span><span class="p">((</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">))</span> |
| |
| <span class="c1"># Validate input types</span> |
| <span class="n">valid_types</span> <span class="o">=</span> <span class="p">(</span><span class="nb">bool</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="n">valid_types</span> <span class="o">+</span> <span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"to_replace should be a bool, float, int, string, list, tuple, or dict. "</span> |
| <span class="s2">"Got </span><span class="si">{0}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">to_replace</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">valid_types</span><span class="p">)</span> <span class="ow">and</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> \ |
| <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"If to_replace is not a dict, value should be "</span> |
| <span class="s2">"a bool, float, int, string, list, tuple or None. "</span> |
| <span class="s2">"Got </span><span class="si">{0}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">value</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">))</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">to_replace</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"to_replace and value lists should be of the same length. "</span> |
| <span class="s2">"Got </span><span class="si">{0}</span><span class="s2"> and </span><span class="si">{1}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">to_replace</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="n">subset</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">,</span> <span class="nb">str</span><span class="p">))):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"subset should be a list or tuple of column names, "</span> |
| <span class="s2">"column name or None. Got </span><span class="si">{0}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">subset</span><span class="p">)))</span> |
| |
| <span class="c1"># Reshape input arguments if necessary</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">)):</span> |
| <span class="n">to_replace</span> <span class="o">=</span> <span class="p">[</span><span class="n">to_replace</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="n">rep_dict</span> <span class="o">=</span> <span class="n">to_replace</span> |
| <span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"to_replace is a dict and value is not None. value will be ignored."</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">))</span> <span class="ow">or</span> <span class="n">value</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="p">[</span><span class="n">value</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">to_replace</span><span class="p">))]</span> |
| <span class="n">rep_dict</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="n">value</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">subset</span> <span class="o">=</span> <span class="p">[</span><span class="n">subset</span><span class="p">]</span> |
| |
| <span class="c1"># Verify we were not passed in mixed type generics.</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span><span class="n">all_of_type</span><span class="p">(</span><span class="n">rep_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span> |
| <span class="ow">and</span> <span class="n">all_of_type</span><span class="p">(</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">rep_dict</span><span class="o">.</span><span class="n">values</span><span class="p">()</span> <span class="k">if</span> <span class="n">x</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">all_of_type</span> <span class="ow">in</span> <span class="p">[</span><span class="n">all_of_bool</span><span class="p">,</span> <span class="n">all_of_str</span><span class="p">,</span> <span class="n">all_of_numeric</span><span class="p">]):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Mixed type replacements are not supported"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">subset</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">na</span><span class="p">()</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'*'</span><span class="p">,</span> <span class="n">rep_dict</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">na</span><span class="p">()</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">subset</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jmap</span><span class="p">(</span><span class="n">rep_dict</span><span class="p">)),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.approxQuantile"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.approxQuantile.html#pyspark.sql.DataFrame.approxQuantile">[docs]</a> <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">probabilities</span><span class="p">,</span> <span class="n">relativeError</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Calculates the approximate quantiles of numerical columns of a</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> The result of this algorithm has the following deterministic bound:</span> |
| <span class="sd"> If the :class:`DataFrame` has N elements and if we request the quantile at</span> |
| <span class="sd"> probability `p` up to error `err`, then the algorithm will return</span> |
| <span class="sd"> a sample `x` from the :class:`DataFrame` so that the *exact* rank of `x` is</span> |
| <span class="sd"> close to (p * N). More precisely,</span> |
| |
| <span class="sd"> floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).</span> |
| |
| <span class="sd"> This method implements a variation of the Greenwald-Khanna</span> |
| <span class="sd"> algorithm (with some speed optimizations). The algorithm was first</span> |
| <span class="sd"> present in [[https://doi.org/10.1145/375663.375670</span> |
| <span class="sd"> Space-efficient Online Computation of Quantile Summaries]]</span> |
| <span class="sd"> by Greenwald and Khanna.</span> |
| |
| <span class="sd"> Note that null values will be ignored in numerical columns before calculation.</span> |
| <span class="sd"> For columns only containing null values, an empty list is returned.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col: str, tuple or list</span> |
| <span class="sd"> Can be a single column name, or a list of names for multiple columns.</span> |
| |
| <span class="sd"> .. versionchanged:: 2.2</span> |
| <span class="sd"> Added support for multiple columns.</span> |
| <span class="sd"> probabilities : list or tuple</span> |
| <span class="sd"> a list of quantile probabilities</span> |
| <span class="sd"> Each number must belong to [0, 1].</span> |
| <span class="sd"> For example 0 is the minimum, 0.5 is the median, 1 is the maximum.</span> |
| <span class="sd"> relativeError : float</span> |
| <span class="sd"> The relative target precision to achieve</span> |
| <span class="sd"> (>= 0). If set to zero, the exact quantiles are computed, which</span> |
| <span class="sd"> could be very expensive. Note that values greater than 1 are</span> |
| <span class="sd"> accepted but give the same result as 1.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> the approximate quantiles at the given probabilities. If</span> |
| <span class="sd"> the input `col` is a string, the output is a list of floats. If the</span> |
| <span class="sd"> input `col` is a list or tuple of strings, the output is also a</span> |
| <span class="sd"> list, but each element in it is a list of floats, i.e., the output</span> |
| <span class="sd"> is a list of list of floats.</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col should be a string, list or tuple, but got </span><span class="si">%r</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| |
| <span class="n">isStr</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">isStr</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="p">]</span> |
| |
| <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">col</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"columns should be strings, but got </span><span class="si">%r</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">c</span><span class="p">))</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">_to_list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">probabilities</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"probabilities should be a list or tuple"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">probabilities</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">probabilities</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">probabilities</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">probabilities</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">p</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">))</span> <span class="ow">or</span> <span class="n">p</span> <span class="o"><</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">p</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"probabilities should be numerical (float, int) in [0,1]."</span><span class="p">)</span> |
| <span class="n">probabilities</span> <span class="o">=</span> <span class="n">_to_list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">probabilities</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">relativeError</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"relativeError should be numerical (float, int)"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">relativeError</span> <span class="o"><</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"relativeError should be >= 0."</span><span class="p">)</span> |
| <span class="n">relativeError</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">relativeError</span><span class="p">)</span> |
| |
| <span class="n">jaq</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">stat</span><span class="p">()</span><span class="o">.</span><span class="n">approxQuantile</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">probabilities</span><span class="p">,</span> <span class="n">relativeError</span><span class="p">)</span> |
| <span class="n">jaq_list</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">j</span><span class="p">)</span> <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="n">jaq</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">jaq_list</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">isStr</span> <span class="k">else</span> <span class="n">jaq_list</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.corr"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.corr.html#pyspark.sql.DataFrame.corr">[docs]</a> <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Calculates the correlation of two columns of a :class:`DataFrame` as a double value.</span> |
| <span class="sd"> Currently only supports the Pearson Correlation Coefficient.</span> |
| <span class="sd"> :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str</span> |
| <span class="sd"> The name of the first column</span> |
| <span class="sd"> col2 : str</span> |
| <span class="sd"> The name of the second column</span> |
| <span class="sd"> method : str, optional</span> |
| <span class="sd"> The correlation method. Currently only supports "pearson"</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col1 should be a string."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col2</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col2 should be a string."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">method</span><span class="p">:</span> |
| <span class="n">method</span> <span class="o">=</span> <span class="s2">"pearson"</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">"pearson"</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Currently only the calculation of the Pearson Correlation "</span> <span class="o">+</span> |
| <span class="s2">"coefficient is supported."</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">stat</span><span class="p">()</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">,</span> <span class="n">method</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.cov"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.cov.html#pyspark.sql.DataFrame.cov">[docs]</a> <span class="k">def</span> <span class="nf">cov</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Calculate the sample covariance for the given columns, specified by their names, as a</span> |
| <span class="sd"> double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str</span> |
| <span class="sd"> The name of the first column</span> |
| <span class="sd"> col2 : str</span> |
| <span class="sd"> The name of the second column</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col1 should be a string."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col2</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col2 should be a string."</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">stat</span><span class="p">()</span><span class="o">.</span><span class="n">cov</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.crosstab"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.crosstab.html#pyspark.sql.DataFrame.crosstab">[docs]</a> <span class="k">def</span> <span class="nf">crosstab</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes a pair-wise frequency table of the given columns. Also known as a contingency</span> |
| <span class="sd"> table. The number of distinct values for each column should be less than 1e4. At most 1e6</span> |
| <span class="sd"> non-zero pair frequencies will be returned.</span> |
| <span class="sd"> The first column of each row will be the distinct values of `col1` and the column names</span> |
| <span class="sd"> will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.</span> |
| <span class="sd"> Pairs that have no occurrences will have zero as their counts.</span> |
| <span class="sd"> :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str</span> |
| <span class="sd"> The name of the first column. Distinct items will make the first item of</span> |
| <span class="sd"> each row.</span> |
| <span class="sd"> col2 : str</span> |
| <span class="sd"> The name of the second column. Distinct items will make the column names</span> |
| <span class="sd"> of the :class:`DataFrame`.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col1 should be a string."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col2</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col2 should be a string."</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">stat</span><span class="p">()</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.freqItems"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.freqItems.html#pyspark.sql.DataFrame.freqItems">[docs]</a> <span class="k">def</span> <span class="nf">freqItems</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">support</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Finding frequent items for columns, possibly with false positives. Using the</span> |
| <span class="sd"> frequent element count algorithm described in</span> |
| <span class="sd"> "https://doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".</span> |
| <span class="sd"> :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : list or tuple</span> |
| <span class="sd"> Names of the columns to calculate frequent items for as a list or tuple of</span> |
| <span class="sd"> strings.</span> |
| <span class="sd"> support : float, optional</span> |
| <span class="sd"> The frequency with which to consider an item 'frequent'. Default is 1%.</span> |
| <span class="sd"> The support must be greater than 1e-4.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function is meant for exploratory data analysis, as we make no</span> |
| <span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"cols must be a list or tuple of column names as strings."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">support</span><span class="p">:</span> |
| <span class="n">support</span> <span class="o">=</span> <span class="mf">0.01</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">stat</span><span class="p">()</span><span class="o">.</span><span class="n">freqItems</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">),</span> <span class="n">support</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withColumn"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.withColumn.html#pyspark.sql.DataFrame.withColumn">[docs]</a> <span class="k">def</span> <span class="nf">withColumn</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colName</span><span class="p">,</span> <span class="n">col</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` by adding a column or replacing the</span> |
| <span class="sd"> existing column that has the same name.</span> |
| |
| <span class="sd"> The column expression must be an expression over this :class:`DataFrame`; attempting to add</span> |
| <span class="sd"> a column from some other :class:`DataFrame` will raise an error.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> colName : str</span> |
| <span class="sd"> string, name of the new column.</span> |
| <span class="sd"> col : :class:`Column`</span> |
| <span class="sd"> a :class:`Column` expression for the new column.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method introduces a projection internally. Therefore, calling it multiple</span> |
| <span class="sd"> times, for instance, via loops in order to add multiple columns can generate big</span> |
| <span class="sd"> plans which can cause performance issues and even `StackOverflowException`.</span> |
| <span class="sd"> To avoid this, use :func:`select` with the multiple columns at once.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.withColumn('age2', df.age + 2).collect()</span> |
| <span class="sd"> [Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)]</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col should be Column"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">colName</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">_jc</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withColumnRenamed"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.withColumnRenamed.html#pyspark.sql.DataFrame.withColumnRenamed">[docs]</a> <span class="k">def</span> <span class="nf">withColumnRenamed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">existing</span><span class="p">,</span> <span class="n">new</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` by renaming an existing column.</span> |
| <span class="sd"> This is a no-op if schema doesn't contain the given column name.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> existing : str</span> |
| <span class="sd"> string, name of the existing column to rename.</span> |
| <span class="sd"> new : str</span> |
| <span class="sd"> string, new name of the column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.withColumnRenamed('age', 'age2').collect()</span> |
| <span class="sd"> [Row(age2=2, name='Alice'), Row(age2=5, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="n">existing</span><span class="p">,</span> <span class="n">new</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.drop"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.drop.html#pyspark.sql.DataFrame.drop">[docs]</a> <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` that drops the specified column.</span> |
| <span class="sd"> This is a no-op if schema doesn't contain the given column name(s).</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols: str or :class:`Column`</span> |
| <span class="sd"> a name of the column, or the :class:`Column` to drop</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.drop('age').collect()</span> |
| <span class="sd"> [Row(name='Alice'), Row(name='Bob')]</span> |
| |
| <span class="sd"> >>> df.drop(df.age).collect()</span> |
| <span class="sd"> [Row(name='Alice'), Row(name='Bob')]</span> |
| |
| <span class="sd"> >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()</span> |
| <span class="sd"> [Row(age=5, height=85, name='Bob')]</span> |
| |
| <span class="sd"> >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()</span> |
| <span class="sd"> [Row(age=5, name='Bob', height=85)]</span> |
| |
| <span class="sd"> >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()</span> |
| <span class="sd"> [Row(name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">_jc</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"col should be a string or a Column"</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"each col in the param list should be a string"</span><span class="p">)</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">cols</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.toDF"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.toDF.html#pyspark.sql.DataFrame.toDF">[docs]</a> <span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame` that with new specified column names</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : str</span> |
| <span class="sd"> new column names</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.toDF('f1', 'f2').collect()</span> |
| <span class="sd"> [Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">toDF</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jseq</span><span class="p">(</span><span class="n">cols</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.transform"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.transform.html#pyspark.sql.DataFrame.transform">[docs]</a> <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">):</span> |
| <span class="sd">"""Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> a function that takes and returns a :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import col</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])</span> |
| <span class="sd"> >>> def cast_all_to_int(input_df):</span> |
| <span class="sd"> ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])</span> |
| <span class="sd"> >>> def sort_columns_asc(input_df):</span> |
| <span class="sd"> ... return input_df.select(*sorted(input_df.columns))</span> |
| <span class="sd"> >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |float|int|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 2| 2|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> """</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="n">func</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">),</span> <span class="s2">"Func returned an instance of type [</span><span class="si">%s</span><span class="s2">], "</span> \ |
| <span class="s2">"should have been DataFrame."</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">result</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sameSemantics"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.sameSemantics.html#pyspark.sql.DataFrame.sameSemantics">[docs]</a> <span class="k">def</span> <span class="nf">sameSemantics</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and</span> |
| <span class="sd"> therefore return same results.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The equality comparison here is simplified by tolerating the cosmetic differences</span> |
| <span class="sd"> such as attribute names.</span> |
| |
| <span class="sd"> This API can compare both :class:`DataFrame`\\s very fast but can still return</span> |
| <span class="sd"> `False` on the :class:`DataFrame` that return the same results, for instance, from</span> |
| <span class="sd"> different plans. Such false negative semantic can be useful when caching as an example.</span> |
| |
| <span class="sd"> This API is a developer API.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.range(10)</span> |
| <span class="sd"> >>> df2 = spark.range(10)</span> |
| <span class="sd"> >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2))</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id + 2))</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col0", df2.id * 2))</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"other parameter should be of DataFrame; however, got </span><span class="si">%s</span><span class="s2">"</span> |
| <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">))</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">sameSemantics</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jdf</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.semanticHash"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.semanticHash.html#pyspark.sql.DataFrame.semanticHash">[docs]</a> <span class="k">def</span> <span class="nf">semanticHash</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a hash code of the logical query plan against this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Unlike the standard hash code, the hash is calculated against the query plan</span> |
| <span class="sd"> simplified by tolerating the cosmetic differences such as attribute names.</span> |
| |
| <span class="sd"> This API is a developer API.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(10).selectExpr("id as col0").semanticHash() # doctest: +SKIP</span> |
| <span class="sd"> 1855039936</span> |
| <span class="sd"> >>> spark.range(10).selectExpr("id as col1").semanticHash() # doctest: +SKIP</span> |
| <span class="sd"> 1855039936</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">semanticHash</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.inputFiles"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.inputFiles.html#pyspark.sql.DataFrame.inputFiles">[docs]</a> <span class="k">def</span> <span class="nf">inputFiles</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a best-effort snapshot of the files that compose this :class:`DataFrame`.</span> |
| <span class="sd"> This method simply asks each constituent BaseRelation for its respective files and</span> |
| <span class="sd"> takes the union of all results. Depending on the source relations, this may not find</span> |
| <span class="sd"> all input files. Duplicates are removed.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.read.load("examples/src/main/resources/people.json", format="json")</span> |
| <span class="sd"> >>> len(df.inputFiles())</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">inputFiles</span><span class="p">())</span></div> |
| |
| <span class="n">where</span> <span class="o">=</span> <span class="n">copy_func</span><span class="p">(</span> |
| <span class="nb">filter</span><span class="p">,</span> |
| <span class="n">sinceversion</span><span class="o">=</span><span class="mf">1.3</span><span class="p">,</span> |
| <span class="n">doc</span><span class="o">=</span><span class="s2">":func:`where` is an alias for :func:`filter`."</span><span class="p">)</span> |
| |
| <span class="c1"># Two aliases below were added for pandas compatibility many years ago.</span> |
| <span class="c1"># There are too many differences compared to pandas and we cannot just</span> |
| <span class="c1"># make it "compatible" by adding aliases. Therefore, we stop adding such</span> |
| <span class="c1"># aliases as of Spark 3.0. Two methods below remain just</span> |
| <span class="c1"># for legacy users currently.</span> |
| <span class="n">groupby</span> <span class="o">=</span> <span class="n">copy_func</span><span class="p">(</span> |
| <span class="n">groupBy</span><span class="p">,</span> |
| <span class="n">sinceversion</span><span class="o">=</span><span class="mf">1.4</span><span class="p">,</span> |
| <span class="n">doc</span><span class="o">=</span><span class="s2">":func:`groupby` is an alias for :func:`groupBy`."</span><span class="p">)</span> |
| |
| <span class="n">drop_duplicates</span> <span class="o">=</span> <span class="n">copy_func</span><span class="p">(</span> |
| <span class="n">dropDuplicates</span><span class="p">,</span> |
| <span class="n">sinceversion</span><span class="o">=</span><span class="mf">1.4</span><span class="p">,</span> |
| <span class="n">doc</span><span class="o">=</span><span class="s2">":func:`drop_duplicates` is an alias for :func:`dropDuplicates`."</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.writeTo"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.writeTo.html#pyspark.sql.DataFrame.writeTo">[docs]</a> <span class="k">def</span> <span class="nf">writeTo</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">table</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Create a write configuration builder for v2 sources.</span> |
| |
| <span class="sd"> This builder is used to configure and execute write operations.</span> |
| |
| <span class="sd"> For example, to append or create or replace existing tables.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.writeTo("catalog.db.table").append() # doctest: +SKIP</span> |
| <span class="sd"> >>> df.writeTo( # doctest: +SKIP</span> |
| <span class="sd"> ... "catalog.db.table"</span> |
| <span class="sd"> ... ).partitionedBy("col").createOrReplace()</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrameWriterV2</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">table</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_pandas_on_spark"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrame.to_pandas_on_spark.html#pyspark.sql.DataFrame.to_pandas_on_spark">[docs]</a> <span class="k">def</span> <span class="nf">to_pandas_on_spark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Converts the existing DataFrame into a pandas-on-Spark DataFrame.</span> |
| |
| <span class="sd"> If a pandas-on-Spark DataFrame is converted to a Spark DataFrame and then back</span> |
| <span class="sd"> to pandas-on-Spark, it will lose the index information and the original index</span> |
| <span class="sd"> will be turned into a normal column.</span> |
| |
| <span class="sd"> This is only available if Pandas is installed and available.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index_col: str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.frame.DataFrame.to_spark</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.show() # doctest: +SKIP</span> |
| <span class="sd"> +----+----+</span> |
| <span class="sd"> |Col1|Col2|</span> |
| <span class="sd"> +----+----+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | b| 2|</span> |
| <span class="sd"> | c| 3|</span> |
| <span class="sd"> +----+----+</span> |
| |
| <span class="sd"> >>> df.to_pandas_on_spark() # doctest: +SKIP</span> |
| <span class="sd"> Col1 Col2</span> |
| <span class="sd"> 0 a 1</span> |
| <span class="sd"> 1 b 2</span> |
| <span class="sd"> 2 c 3</span> |
| |
| <span class="sd"> We can specify the index columns.</span> |
| |
| <span class="sd"> >>> df.to_pandas_on_spark(index_col="Col1"): # doctest: +SKIP</span> |
| <span class="sd"> Col2</span> |
| <span class="sd"> Col1</span> |
| <span class="sd"> a 1</span> |
| <span class="sd"> b 2</span> |
| <span class="sd"> c 3</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.namespace</span> <span class="kn">import</span> <span class="n">_get_index_map</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="n">InternalFrame</span> |
| |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="bp">self</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <span class="c1"># Keep to_koalas for backward compatibility for now.</span> |
| <span class="k">def</span> <span class="nf">to_koalas</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"DataFrame.to_koalas is deprecated. Use DataFrame.to_pandas_on_spark instead."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_pandas_on_spark</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_to_scala_map</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">jm</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert a dict into a JVM Map.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonUtils</span><span class="o">.</span><span class="n">toScalaMap</span><span class="p">(</span><span class="n">jm</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameNaFunctions.html#pyspark.sql.DataFrameNaFunctions">[docs]</a><span class="k">class</span> <span class="nc">DataFrameNaFunctions</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> |
| <span class="sd">"""Functionality for working with missing data in :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">df</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span> |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions.drop"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameNaFunctions.drop.html#pyspark.sql.DataFrameNaFunctions.drop">[docs]</a> <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">'any'</span><span class="p">,</span> <span class="n">thresh</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">how</span><span class="o">=</span><span class="n">how</span><span class="p">,</span> <span class="n">thresh</span><span class="o">=</span><span class="n">thresh</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="n">subset</span><span class="p">)</span></div> |
| |
| <span class="n">drop</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">dropna</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions.fill"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameNaFunctions.fill.html#pyspark.sql.DataFrameNaFunctions.fill">[docs]</a> <span class="k">def</span> <span class="nf">fill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="n">subset</span><span class="p">)</span></div> |
| |
| <span class="n">fill</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">fillna</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions.replace"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameNaFunctions.replace.html#pyspark.sql.DataFrameNaFunctions.replace">[docs]</a> <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">to_replace</span><span class="p">,</span> <span class="n">value</span><span class="o">=</span><span class="n">_NoValue</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="n">value</span><span class="p">,</span> <span class="n">subset</span><span class="p">)</span></div> |
| |
| <span class="n">replace</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">replace</span><span class="o">.</span><span class="vm">__doc__</span></div> |
| |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameStatFunctions.html#pyspark.sql.DataFrameStatFunctions">[docs]</a><span class="k">class</span> <span class="nc">DataFrameStatFunctions</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> |
| <span class="sd">"""Functionality for statistic functions with :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">df</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.approxQuantile"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameStatFunctions.approxQuantile.html#pyspark.sql.DataFrameStatFunctions.approxQuantile">[docs]</a> <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">probabilities</span><span class="p">,</span> <span class="n">relativeError</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">approxQuantile</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">probabilities</span><span class="p">,</span> <span class="n">relativeError</span><span class="p">)</span></div> |
| |
| <span class="n">approxQuantile</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">approxQuantile</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.corr"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameStatFunctions.corr.html#pyspark.sql.DataFrameStatFunctions.corr">[docs]</a> <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">,</span> <span class="n">method</span><span class="p">)</span></div> |
| |
| <span class="n">corr</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">corr</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.cov"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameStatFunctions.cov.html#pyspark.sql.DataFrameStatFunctions.cov">[docs]</a> <span class="k">def</span> <span class="nf">cov</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">cov</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| <span class="n">cov</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">cov</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.crosstab"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameStatFunctions.crosstab.html#pyspark.sql.DataFrameStatFunctions.crosstab">[docs]</a> <span class="k">def</span> <span class="nf">crosstab</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| <span class="n">crosstab</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">crosstab</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.freqItems"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameStatFunctions.freqItems.html#pyspark.sql.DataFrameStatFunctions.freqItems">[docs]</a> <span class="k">def</span> <span class="nf">freqItems</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">support</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">freqItems</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="n">support</span><span class="p">)</span></div> |
| |
| <span class="n">freqItems</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">freqItems</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.sampleBy"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.DataFrameStatFunctions.sampleBy.html#pyspark.sql.DataFrameStatFunctions.sampleBy">[docs]</a> <span class="k">def</span> <span class="nf">sampleBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">fractions</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">df</span><span class="o">.</span><span class="n">sampleBy</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">fractions</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span></div> |
| |
| <span class="n">sampleBy</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">sampleBy</span><span class="o">.</span><span class="vm">__doc__</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">():</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">from</span> <span class="nn">pyspark.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span><span class="p">,</span> <span class="n">SQLContext</span><span class="p">,</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.sql.dataframe</span> |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">dataframe</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="p">(</span><span class="s1">'local[4]'</span><span class="p">,</span> <span class="s1">'PythonTest'</span><span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'sc'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'sqlContext'</span><span class="p">]</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'spark'</span><span class="p">]</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'df'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([(</span><span class="mi">2</span><span class="p">,</span> <span class="s1">'Alice'</span><span class="p">),</span> <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s1">'Bob'</span><span class="p">)])</span>\ |
| <span class="o">.</span><span class="n">toDF</span><span class="p">(</span><span class="n">StructType</span><span class="p">([</span><span class="n">StructField</span><span class="p">(</span><span class="s1">'age'</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">()),</span> |
| <span class="n">StructField</span><span class="p">(</span><span class="s1">'name'</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())]))</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'df2'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">height</span><span class="o">=</span><span class="mi">80</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Tom'</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="n">height</span><span class="o">=</span><span class="mi">85</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Bob'</span><span class="p">)])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'df3'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Alice'</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Bob'</span><span class="p">)])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'df4'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">height</span><span class="o">=</span><span class="mi">80</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Alice'</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">height</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Bob'</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">height</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Tom'</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">height</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="kc">None</span><span class="p">)])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'df5'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Alice'</span><span class="p">,</span> <span class="n">spy</span><span class="o">=</span><span class="kc">False</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Bob'</span><span class="p">,</span> <span class="n">spy</span><span class="o">=</span><span class="kc">None</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">'Mallory'</span><span class="p">,</span> <span class="n">spy</span><span class="o">=</span><span class="kc">True</span><span class="p">)])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'sdf'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s1">'Tom'</span><span class="p">,</span> <span class="n">time</span><span class="o">=</span><span class="mi">1479441846</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s1">'Bob'</span><span class="p">,</span> <span class="n">time</span><span class="o">=</span><span class="mi">1479442946</span><span class="p">)])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span> |
| |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">dataframe</span><span class="p">,</span> <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">REPORT_NDIFF</span><span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'sc'</span><span class="p">]</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| © Copyright .<br/> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |