| |
| |
| <!DOCTYPE html> |
| |
| |
| <html > |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Quickstart: DataFrame — PySpark 4.0.0-preview1 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"; |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/nbsphinx-code-cells.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/clipboard.min.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script> |
| <script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = 'getting_started/quickstart_df';</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Quickstart: Spark Connect" href="quickstart_connect.html" /> |
| <link rel="prev" title="Installation" href="install.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="None"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "getting_started/quickstart_df.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| <label class="sidebar-toggle secondary-toggle" for="__secondary"> |
| <span class="fa-solid fa-outdent"></span> |
| </label> |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "getting_started/quickstart_df.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links" |
| aria-label="Section Navigation"> |
| <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p> |
| <div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference internal" href="install.html">Installation</a></li> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Quickstart: DataFrame</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="quickstart_connect.html">Quickstart: Spark Connect</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="quickstart_ps.html">Quickstart: Pandas API on Spark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="testing_pyspark.html">Testing PySpark</a></li> |
| </ul> |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="index.html" class="nav-link">Getting Started</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">Quickstart: DataFrame</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <section id="Quickstart:-DataFrame"> |
| <h1>Quickstart: DataFrame<a class="headerlink" href="#Quickstart:-DataFrame" title="Permalink to this headline">#</a></h1> |
| <p>This is a short introduction and quickstart for the PySpark DataFrame API. PySpark DataFrames are lazily evaluated. They are implemented on top of <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#overview">RDD</a>s. When Spark <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#transformations">transforms</a> data, it does not immediately compute the transformation but plans how to compute later. When |
| <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#actions">actions</a> such as <code class="docutils literal notranslate"><span class="pre">collect()</span></code> are explicitly called, the computation starts. This notebook shows the basic usages of the DataFrame, geared mainly for new users. You can run the latest version of these examples by yourself in ‘Live Notebook: DataFrame’ at <a class="reference external" href="https://spark.apache.org/docs/latest/api/python/getting_started/index.html">the quickstart page</a>.</p> |
| <p>There is also other useful information in Apache Spark documentation site, see the latest version of <a class="reference external" href="https://spark.apache.org/docs/latest/sql-programming-guide.html">Spark SQL and DataFrames</a>, <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html">RDD Programming Guide</a>, <a class="reference external" href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html">Structured Streaming Programming Guide</a>, <a class="reference external" href="https://spark.apache.org/docs/latest/streaming-programming-guide.html">Spark Streaming Programming |
| Guide</a> and <a class="reference external" href="https://spark.apache.org/docs/latest/ml-guide.html">Machine Learning Library (MLlib) Guide</a>.</p> |
| <p>PySpark applications start with initializing <code class="docutils literal notranslate"><span class="pre">SparkSession</span></code> which is the entry point of PySpark as below. In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users.</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <section id="DataFrame-Creation"> |
| <h2>DataFrame Creation<a class="headerlink" href="#DataFrame-Creation" title="Permalink to this headline">#</a></h2> |
| <p>A PySpark DataFrame can be created via <code class="docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession.createDataFrame</span></code> typically by passing a list of lists, tuples, dictionaries and <code class="docutils literal notranslate"><span class="pre">pyspark.sql.Row</span></code>s, a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> and an RDD consisting of such a list. <code class="docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession.createDataFrame</span></code> takes the <code class="docutils literal notranslate"><span class="pre">schema</span></code> argument to specify the schema of the DataFrame. When it is omitted, PySpark infers the corresponding schema by taking a sample from |
| the data.</p> |
| <p>Firstly, you can create a PySpark DataFrame from a list of rows</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">date</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">2.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string1'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">3.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string2'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">5.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string3'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span> |
| <span class="p">])</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>Create a PySpark DataFrame with an explicit schema.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="s1">'string1'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">4.</span><span class="p">,</span> <span class="s1">'string3'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span> |
| <span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="s1">'a long, b double, c string, d date, e timestamp'</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>Create a PySpark DataFrame from a pandas DataFrame</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">pandas_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span> |
| <span class="s1">'a'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> |
| <span class="s1">'b'</span><span class="p">:</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">],</span> |
| <span class="s1">'c'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'string1'</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">,</span> <span class="s1">'string3'</span><span class="p">],</span> |
| <span class="s1">'d'</span><span class="p">:</span> <span class="p">[</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">)],</span> |
| <span class="s1">'e'</span><span class="p">:</span> <span class="p">[</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)]</span> |
| <span class="p">})</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">pandas_df</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>The DataFrames created above all have the same results and schema.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># All DataFrames above result same.</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| | 2|3.0|string2|2000-02-01|2000-01-02 12:00:00| |
| | 3|4.0|string3|2000-03-01|2000-01-03 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| root |
| |-- a: long (nullable = true) |
| |-- b: double (nullable = true) |
| |-- c: string (nullable = true) |
| |-- d: date (nullable = true) |
| |-- e: timestamp (nullable = true) |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Viewing-Data"> |
| <h2>Viewing Data<a class="headerlink" href="#Viewing-Data" title="Permalink to this headline">#</a></h2> |
| <p>The top rows of a DataFrame can be displayed using <code class="docutils literal notranslate"><span class="pre">DataFrame.show()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| only showing top 1 row |
| |
| </pre></div></div> |
| </div> |
| <p>Alternatively, you can enable <code class="docutils literal notranslate"><span class="pre">spark.sql.repl.eagerEval.enabled</span></code> configuration for the eager evaluation of PySpark DataFrame in notebooks such as Jupyter. The number of rows to show can be controlled via <code class="docutils literal notranslate"><span class="pre">spark.sql.repl.eagerEval.maxNumRows</span></code> configuration.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s1">'spark.sql.repl.eagerEval.enabled'</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="output_area rendered_html docutils container"> |
| <table border='1' style="table-layout: auto;margin-right: auto;margin-left: 0;"> |
| <tr><th>a</th><th>b</th><th>c</th><th>d</th><th>e</th></tr> |
| <tr><td>1</td><td>2.0</td><td>string1</td><td>2000-01-01</td><td>2000-01-01 12:00:00</td></tr> |
| <tr><td>2</td><td>3.0</td><td>string2</td><td>2000-02-01</td><td>2000-01-02 12:00:00</td></tr> |
| <tr><td>3</td><td>4.0</td><td>string3</td><td>2000-03-01</td><td>2000-01-03 12:00:00</td></tr> |
| </table></div> |
| </div> |
| <p>The rows can also be shown vertically. This is useful when rows are too long to show horizontally.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">vertical</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| -RECORD 0------------------ |
| a | 1 |
| b | 2.0 |
| c | string1 |
| d | 2000-01-01 |
| e | 2000-01-01 12:00:00 |
| only showing top 1 row |
| |
| </pre></div></div> |
| </div> |
| <p>You can see the DataFrame’s schema and column names as follows:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">columns</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| ['a', 'b', 'c', 'd', 'e'] |
| </pre></div></div> |
| </div> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| root |
| |-- a: long (nullable = true) |
| |-- b: double (nullable = true) |
| |-- c: string (nullable = true) |
| |-- d: date (nullable = true) |
| |-- e: timestamp (nullable = true) |
| |
| </pre></div></div> |
| </div> |
| <p>Show the summary of the DataFrame</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"a"</span><span class="p">,</span> <span class="s2">"b"</span><span class="p">,</span> <span class="s2">"c"</span><span class="p">)</span><span class="o">.</span><span class="n">describe</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+---+---+-------+ |
| |summary| a| b| c| |
| +-------+---+---+-------+ |
| | count| 3| 3| 3| |
| | mean|2.0|3.0| null| |
| | stddev|1.0|1.0| null| |
| | min| 1|2.0|string1| |
| | max| 3|4.0|string3| |
| +-------+---+---+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p><code class="docutils literal notranslate"><span class="pre">DataFrame.collect()</span></code> collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver side because it collects all the data from executors to the driver side.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| [Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)), |
| Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)), |
| Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))] |
| </pre></div></div> |
| </div> |
| <p>In order to avoid throwing an out-of-memory exception, use <code class="docutils literal notranslate"><span class="pre">DataFrame.take()</span></code> or <code class="docutils literal notranslate"><span class="pre">DataFrame.tail()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| [Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))] |
| </pre></div></div> |
| </div> |
| <p>PySpark DataFrame also provides the conversion back to a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> to leverage pandas API. Note that <code class="docutils literal notranslate"><span class="pre">toPandas</span></code> also collects all data into the driver side that can easily cause an out-of-memory-error when the data is too large to fit into the driver side.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="output_area rendered_html docutils container"> |
| <div> |
| <style scoped> |
| .dataframe tbody tr th:only-of-type { |
| vertical-align: middle; |
| } |
| |
| .dataframe tbody tr th { |
| vertical-align: top; |
| } |
| |
| .dataframe thead th { |
| text-align: right; |
| } |
| </style> |
| <table border="1" class="dataframe" style="table-layout: auto;margin-right: auto;margin-left: 0;"> |
| <thead> |
| <tr style="text-align: right;"> |
| <th></th> |
| <th>a</th> |
| <th>b</th> |
| <th>c</th> |
| <th>d</th> |
| <th>e</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <th>0</th> |
| <td>1</td> |
| <td>2.0</td> |
| <td>string1</td> |
| <td>2000-01-01</td> |
| <td>2000-01-01 12:00:00</td> |
| </tr> |
| <tr> |
| <th>1</th> |
| <td>2</td> |
| <td>3.0</td> |
| <td>string2</td> |
| <td>2000-02-01</td> |
| <td>2000-01-02 12:00:00</td> |
| </tr> |
| <tr> |
| <th>2</th> |
| <td>3</td> |
| <td>4.0</td> |
| <td>string3</td> |
| <td>2000-03-01</td> |
| <td>2000-01-03 12:00:00</td> |
| </tr> |
| </tbody> |
| </table> |
| </div></div> |
| </div> |
| </section> |
| <section id="Selecting-and-Accessing-Data"> |
| <h2>Selecting and Accessing Data<a class="headerlink" href="#Selecting-and-Accessing-Data" title="Permalink to this headline">#</a></h2> |
| <p>PySpark DataFrame is lazily evaluated and simply selecting a column does not trigger the computation but it returns a <code class="docutils literal notranslate"><span class="pre">Column</span></code> instance.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">a</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| Column<b'a'> |
| </pre></div></div> |
| </div> |
| <p>In fact, most of column-wise operations return <code class="docutils literal notranslate"><span class="pre">Column</span></code>s.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[17]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">upper</span> |
| |
| <span class="nb">type</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">)</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">upper</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">))</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="o">.</span><span class="n">isNull</span><span class="p">())</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[17]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| True |
| </pre></div></div> |
| </div> |
| <p>These <code class="docutils literal notranslate"><span class="pre">Column</span></code>s can be used to select the columns from a DataFrame. For example, <code class="docutils literal notranslate"><span class="pre">DataFrame.select()</span></code> takes the <code class="docutils literal notranslate"><span class="pre">Column</span></code> instances that returns another DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[18]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+ |
| | c| |
| +-------+ |
| |string1| |
| |string2| |
| |string3| |
| +-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>Assign new <code class="docutils literal notranslate"><span class="pre">Column</span></code> instance.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[19]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">'upper_c'</span><span class="p">,</span> <span class="n">upper</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+-------+ |
| | a| b| c| d| e|upper_c| |
| +---+---+-------+----------+-------------------+-------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1| |
| | 2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2| |
| | 3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3| |
| +---+---+-------+----------+-------------------+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>To select a subset of rows, use <code class="docutils literal notranslate"><span class="pre">DataFrame.filter()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[20]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">a</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Applying-a-Function"> |
| <h2>Applying a Function<a class="headerlink" href="#Applying-a-Function" title="Permalink to this headline">#</a></h2> |
| <p>PySpark supports various UDFs and APIs to allow users to execute Python native functions. See also the latest <a class="reference external" href="https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs">Pandas UDFs</a> and <a class="reference external" href="https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-function-apis">Pandas Function APIs</a>. For instance, the example below allows users to directly use the APIs in <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html">a pandas |
| Series</a> within Python native function.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[21]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="s1">'long'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">pandas_plus_one</span><span class="p">(</span><span class="n">series</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="c1"># Simply plus one by using pandas Series.</span> |
| <span class="k">return</span> <span class="n">series</span> <span class="o">+</span> <span class="mi">1</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">pandas_plus_one</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">a</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +------------------+ |
| |pandas_plus_one(a)| |
| +------------------+ |
| | 2| |
| | 3| |
| | 4| |
| +------------------+ |
| |
| </pre></div></div> |
| </div> |
| <p>Another example is <code class="docutils literal notranslate"><span class="pre">DataFrame.mapInPandas</span></code> which allows users directly use the APIs in a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> without any restrictions such as the result length.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[22]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">pandas_filter_func</span><span class="p">(</span><span class="n">iterator</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">pandas_df</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> |
| <span class="k">yield</span> <span class="n">pandas_df</span><span class="p">[</span><span class="n">pandas_df</span><span class="o">.</span><span class="n">a</span> <span class="o">==</span> <span class="mi">1</span><span class="p">]</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span><span class="n">pandas_filter_func</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Grouping-Data"> |
| <h2>Grouping Data<a class="headerlink" href="#Grouping-Data" title="Permalink to this headline">#</a></h2> |
| <p>PySpark DataFrame also provides a way of handling grouped data by using the common approach, split-apply-combine strategy. It groups the data by a certain condition applies a function to each group and then combines them back to the DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[23]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">10</span><span class="p">],</span> <span class="p">[</span><span class="s1">'blue'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">20</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">30</span><span class="p">],</span> |
| <span class="p">[</span><span class="s1">'blue'</span><span class="p">,</span> <span class="s1">'grape'</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">40</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">50</span><span class="p">],</span> <span class="p">[</span><span class="s1">'black'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">60</span><span class="p">],</span> |
| <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">70</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'grape'</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">80</span><span class="p">]],</span> <span class="n">schema</span><span class="o">=</span><span class="p">[</span><span class="s1">'color'</span><span class="p">,</span> <span class="s1">'fruit'</span><span class="p">,</span> <span class="s1">'v1'</span><span class="p">,</span> <span class="s1">'v2'</span><span class="p">])</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| <p>Grouping and then applying the <code class="docutils literal notranslate"><span class="pre">avg()</span></code> function to the resulting groups.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[24]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'color'</span><span class="p">)</span><span class="o">.</span><span class="n">avg</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+-------+-------+ |
| |color|avg(v1)|avg(v2)| |
| +-----+-------+-------+ |
| | red| 4.8| 48.0| |
| |black| 6.0| 60.0| |
| | blue| 3.0| 30.0| |
| +-----+-------+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>You can also apply a Python native function against each group by using pandas API.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[25]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">plus_mean</span><span class="p">(</span><span class="n">pandas_df</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pandas_df</span><span class="o">.</span><span class="n">assign</span><span class="p">(</span><span class="n">v1</span><span class="o">=</span><span class="n">pandas_df</span><span class="o">.</span><span class="n">v1</span> <span class="o">-</span> <span class="n">pandas_df</span><span class="o">.</span><span class="n">v1</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'color'</span><span class="p">)</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span><span class="n">plus_mean</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| -3| 10| |
| | red|carrot| -1| 30| |
| | red|carrot| 0| 50| |
| | red|banana| 2| 70| |
| | red| grape| 3| 80| |
| |black|carrot| 0| 60| |
| | blue|banana| -1| 20| |
| | blue| grape| 1| 40| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| <p>Co-grouping and applying a function.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[26]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000102</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000102</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">)],</span> |
| <span class="p">(</span><span class="s1">'time'</span><span class="p">,</span> <span class="s1">'id'</span><span class="p">,</span> <span class="s1">'v1'</span><span class="p">))</span> |
| |
| <span class="n">df2</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s1">'x'</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="s1">'y'</span><span class="p">)],</span> |
| <span class="p">(</span><span class="s1">'time'</span><span class="p">,</span> <span class="s1">'id'</span><span class="p">,</span> <span class="s1">'v2'</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="nf">merge_ordered</span><span class="p">(</span><span class="n">l</span><span class="p">,</span> <span class="n">r</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">merge_ordered</span><span class="p">(</span><span class="n">l</span><span class="p">,</span> <span class="n">r</span><span class="p">)</span> |
| |
| <span class="n">df1</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'id'</span><span class="p">)</span><span class="o">.</span><span class="n">cogroup</span><span class="p">(</span><span class="n">df2</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'id'</span><span class="p">))</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span> |
| <span class="n">merge_ordered</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="s1">'time int, id int, v1 double, v2 string'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------+---+---+---+ |
| | time| id| v1| v2| |
| +--------+---+---+---+ |
| |20000101| 1|1.0| x| |
| |20000102| 1|3.0| x| |
| |20000101| 2|2.0| y| |
| |20000102| 2|4.0| y| |
| +--------+---+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Getting-Data-In/Out"> |
| <h2>Getting Data In/Out<a class="headerlink" href="#Getting-Data-In/Out" title="Permalink to this headline">#</a></h2> |
| <p>CSV is straightforward and easy to use. Parquet and ORC are efficient and compact file formats to read and write faster.</p> |
| <p>There are many other data sources available in PySpark such as JDBC, text, binaryFile, Avro, etc. See also the latest <a class="reference external" href="https://spark.apache.org/docs/latest/sql-programming-guide.html">Spark SQL, DataFrames and Datasets Guide</a> in Apache Spark documentation.</p> |
| <section id="CSV"> |
| <h3>CSV<a class="headerlink" href="#CSV" title="Permalink to this headline">#</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[27]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s1">'foo.csv'</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s1">'foo.csv'</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Parquet"> |
| <h3>Parquet<a class="headerlink" href="#Parquet" title="Permalink to this headline">#</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[28]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'bar.parquet'</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'bar.parquet'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="ORC"> |
| <h3>ORC<a class="headerlink" href="#ORC" title="Permalink to this headline">#</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[29]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s1">'zoo.orc'</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s1">'zoo.orc'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| </section> |
| <section id="Working-with-SQL"> |
| <h2>Working with SQL<a class="headerlink" href="#Working-with-SQL" title="Permalink to this headline">#</a></h2> |
| <p>DataFrame and Spark SQL share the same execution engine so they can be interchangeably used seamlessly. For example, you can register the DataFrame as a table and run a SQL easily as below:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[30]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s2">"tableA"</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT count(*) from tableA"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------+ |
| |count(1)| |
| +--------+ |
| | 8| |
| +--------+ |
| |
| </pre></div></div> |
| </div> |
| <p>In addition, UDFs can be registered and invoked in SQL out of the box:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[31]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">"integer"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">add_one</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">s</span> <span class="o">+</span> <span class="mi">1</span> |
| |
| <span class="n">spark</span><span class="o">.</span><span class="n">udf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">"add_one"</span><span class="p">,</span> <span class="n">add_one</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT add_one(v1) FROM tableA"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+ |
| |add_one(v1)| |
| +-----------+ |
| | 2| |
| | 3| |
| | 4| |
| | 5| |
| | 6| |
| | 7| |
| | 8| |
| | 9| |
| +-----------+ |
| |
| </pre></div></div> |
| </div> |
| <p>These SQL expressions can directly be mixed and used as PySpark columns.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[32]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">expr</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="s1">'add_one(v1)'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">expr</span><span class="p">(</span><span class="s1">'count(*)'</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+ |
| |add_one(v1)| |
| +-----------+ |
| | 2| |
| | 3| |
| | 4| |
| | 5| |
| | 6| |
| | 7| |
| | 8| |
| | 9| |
| +-----------+ |
| |
| +--------------+ |
| |(count(1) > 0)| |
| +--------------+ |
| | true| |
| +--------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="install.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">Installation</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="quickstart_connect.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Quickstart: Spark Connect</p> |
| </div> |
| <i class="fa-solid fa-angle-right"></i> |
| </a> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#DataFrame-Creation">DataFrame Creation</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Viewing-Data">Viewing Data</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Selecting-and-Accessing-Data">Selecting and Accessing Data</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Applying-a-Function">Applying a Function</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Grouping-Data">Grouping Data</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Getting-Data-In/Out">Getting Data In/Out</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#CSV">CSV</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Parquet">Parquet</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#ORC">ORC</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Working-with-SQL">Working with SQL</a></li> |
| </ul> |
| </nav></div> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="tocsection sourcelink"> |
| <a href="../_sources/getting_started/quickstart_df.ipynb.txt"> |
| <i class="fa-solid fa-file-lines"></i> Show Source |
| </a> |
| </div> |
| </div> |
| |
| </div></div> |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |