| |
| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en"> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Chapter 1: DataFrames - A view into your structured data — PySpark 4.1.0-preview1 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| // Copy the saved "mode" and "theme" values from localStorage onto the root element's |
| // data-mode / data-theme attributes, falling back to "" and "light" when nothing is stored. |
| (function (rootData, store) { |
|   rootData.mode = store.getItem("mode") || ""; |
|   rootData.theme = store.getItem("theme") || "light"; |
| })(document.documentElement.dataset, localStorage); |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/nbsphinx-code-cells.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/clipboard.min.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script> |
| <script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = 'user_guide/dataframes';</script> |
| <script> |
| DOCUMENTATION_OPTIONS.theme_switcher_json_url = 'https://spark.apache.org/static/versions.json'; |
| DOCUMENTATION_OPTIONS.theme_switcher_version_match = '4.1.0-preview1'; |
| </script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/user_guide/dataframes.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Chapter 2: A Tour of PySpark Data Types" href="touroftypes.html" /> |
| <link rel="prev" title="User Guide" href="index.html" /> |
| <meta name="docsearch:language" content="en"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| // Matomo command queue: reuse window._paq when it already exists, otherwise start an empty array. |
| var _paq = window._paq = window._paq || []; |
| /* Order matters: tracker methods (for example "setCustomDimension") must be queued before "trackPageView". */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function (doc, baseUrl) { |
|   // Point the tracker at the analytics endpoint and select the site id. |
|   _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']); |
|   _paq.push(['setSiteId', '40']); |
|   // Insert the asynchronous matomo.js loader ahead of the first script element on the page. |
|   var firstScript = doc.getElementsByTagName('script')[0]; |
|   var loader = doc.createElement('script'); |
|   loader.async = true; |
|   loader.src = baseUrl + 'matomo.js'; |
|   firstScript.parentNode.insertBefore(loader, firstScript); |
| })(document, "https://analytics.apache.org/"); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| <!-- The logo is the link's only content, so its alt text names the link destination. --> |
| <img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="PySpark 4.1.0-preview1 documentation - Home"/> |
| <!-- The dark-theme logo is written by script, so it only exists when JavaScript runs. --> |
| <script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="PySpark 4.1.0-preview1 documentation - Home"/>`);</script> |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../tutorial/index.html"> |
| Tutorials |
| </a> |
| </li> |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guide |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <!-- The overflow menu is a list item holding a nested list, so each list has only list-item children and each list item has a list parent. --> |
| <li class="nav-item dropdown"> |
| <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| More |
| </button> |
| <ul class="dropdown-menu"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <div class="version-switcher__container dropdown"> |
| <button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown"> |
| 4.1.0-preview1 <!-- this text may get changed later by javascript --> |
| <span class="caret"></span> |
| </button> |
| <div class="version-switcher__menu dropdown-menu list-group-flush py-0"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <!-- Each icon is decorative; the sr-only span supplies the link text. A span is used because a label element belongs to form controls and may not be nested inside a link. --> |
| <li class="nav-item"> |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github" aria-hidden="true"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box" aria-hidden="true"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| <label class="sidebar-toggle secondary-toggle" for="__secondary"> |
| <span class="fa-solid fa-outdent"></span> |
| </label> |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../tutorial/index.html"> |
| Tutorials |
| </a> |
| </li> |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guide |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <!-- The overflow menu is a list item holding a nested list, so each list has only list-item children and each list item has a list parent. --> |
| <li class="nav-item dropdown"> |
| <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| More |
| </button> |
| <ul class="dropdown-menu"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <div class="version-switcher__container dropdown"> |
| <button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown"> |
| 4.1.0-preview1 <!-- this text may get changed later by javascript --> |
| <span class="caret"></span> |
| </button> |
| <div class="version-switcher__menu dropdown-menu list-group-flush py-0"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <!-- Each icon is decorative; the sr-only span supplies the link text. A span is used because a label element belongs to form controls and may not be nested inside a link. --> |
| <li class="nav-item"> |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github" aria-hidden="true"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box" aria-hidden="true"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links" |
| aria-label="Section Navigation"> |
| <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p> |
| <div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav"> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Chapter 1: DataFrames - A view into your structured data</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="touroftypes.html">Chapter 2: A Tour of PySpark Data Types</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="dataprep.html">Chapter 3: Function Junction - Data manipulation with PySpark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="bugbusting.html">Chapter 4: Bug Busting - Debugging PySpark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="udfandudtf.html">Chapter 5: Unleashing UDFs & UDTFs</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="sql.html">Chapter 6: Old SQL, New Tricks - Running SQL on PySpark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="loadandbehold.html">Chapter 7: Load and Behold - Data loading, storage, file formats</a></li> |
| </ul> |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <!-- The enclosing nav element already provides the navigation landmark, so the list keeps its native list semantics. --> |
| <ul class="bd-breadcrumbs"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home" aria-hidden="true"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="index.html" class="nav-link">User Guide</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">Chapter 1: DataFrames - A view into your structured data</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <section id="Chapter-1:-DataFrames---A-view-into-your-structured-data"> |
| <h1>Chapter 1: DataFrames - A view into your structured data<a class="headerlink" href="#Chapter-1:-DataFrames---A-view-into-your-structured-data" title="Permalink to this headline">#</a></h1> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">pyspark</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| Requirement already satisfied: pyspark in /Users/amanda.liu/anaconda3/lib/python3.10/site-packages (3.5.0) |
| Requirement already satisfied: py4j==0.10.9.7 in /Users/amanda.liu/anaconda3/lib/python3.10/site-packages (from pyspark) (0.10.9.7) |
| Note: you may need to restart the kernel to use updated packages. |
| </pre></div></div> |
| </div> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span> \ |
| <span class="o">.</span><span class="n">builder</span> \ |
| <span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"Python Spark SQL basic example"</span><span class="p">)</span> \ |
| <span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">"spark.some.config.option"</span><span class="p">,</span> <span class="s2">"some-value"</span><span class="p">)</span> \ |
| <span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <p>This section introduces the most fundamental data structure in PySpark: the DataFrame.</p> |
| <p>A DataFrame is a two-dimensional labeled data structure with columns of potentially different types. You can think of a DataFrame like a spreadsheet, a SQL table, or a dictionary of series objects. Apache Spark DataFrames support a rich set of APIs (select columns, filter, join, aggregate, etc.) that allow you to solve common data analysis problems efficiently.</p> |
| <p>Compared to traditional relational databases, Spark DataFrames offer several key advantages for big data processing and analytics:</p> |
| <ul class="simple"> |
| <li><p><strong>Distributed computing</strong>: Spark distributes data across multiple nodes in a cluster, allowing for parallel processing of big data.</p></li> |
| <li><p><strong>In-memory processing</strong>: Spark performs computations in memory, which can be significantly faster than disk-based processing.</p></li> |
| <li><p><strong>Schema flexibility</strong>: Unlike traditional databases, PySpark DataFrames support schema evolution and dynamic typing.</p></li> |
| <li><p><strong>Fault tolerance</strong>: PySpark DataFrames are built on top of Resilient Distributed Datasets (RDDs), which are inherently fault-tolerant. Spark automatically handles node failures and data replication, ensuring data reliability and integrity.</p></li> |
| </ul> |
| <p>A note on RDDs: Direct use of RDDs is no longer supported on Spark Connect as of Spark 4.0. Interacting directly with Spark DataFrames uses a unified planning and optimization engine, allowing us to get nearly identical performance across all supported languages on Databricks (Python, SQL, Scala, and R).</p> |
| <section id="Create-a-DataFrame"> |
| <h2>Create a DataFrame<a class="headerlink" href="#Create-a-DataFrame" title="Permalink to this headline">#</a></h2> |
| <p>There are several ways to create a DataFrame in PySpark.</p> |
| <section id="From-a-list-of-dictionaries"> |
| <h3>From a list of dictionaries<a class="headerlink" href="#From-a-list-of-dictionaries" title="Permalink to this headline">#</a></h3> |
| <p>The simplest way is to use the createDataFrame() method like so:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">employees</span> <span class="o">=</span> <span class="p">[{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"John D."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">30</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Alice G."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">25</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Bob T."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">35</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Eve A."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">28</span><span class="p">}]</span> |
| |
| <span class="c1"># Create a DataFrame containing the employees data</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">employees</span><span class="p">)</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area stderr docutils container"> |
| <div class="highlight"><pre> |
| |
| </pre></div></div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+--------+ |
| |age| name| |
| +---+--------+ |
| | 30| John D.| |
| | 25|Alice G.| |
| | 35| Bob T.| |
| | 28| Eve A.| |
| +---+--------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="From-a-local-file"> |
| <h3>From a local file<a class="headerlink" href="#From-a-local-file" title="Permalink to this headline">#</a></h3> |
| <p>We can also create a DataFrame from a local CSV file:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s2">"../data/employees.csv"</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">inferSchema</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+-----------------+-----------------+ |
| |Employee ID| Role| Location| |
| +-----------+-----------------+-----------------+ |
| | 19238| Data Analyst| Seattle, WA| |
| | 19239|Software Engineer| Seattle, WA| |
| | 19240| IT Specialist| Seattle, WA| |
| | 19241| Data Analyst| New York, NY| |
| | 19242| Recruiter|San Francisco, CA| |
| | 19243| Product Manager| New York, NY| |
| +-----------+-----------------+-----------------+ |
| |
| </pre></div></div> |
| </div> |
| <p>Or from a local JSON file:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"multiline"</span><span class="p">,</span><span class="s2">"true"</span><span class="p">)</span><span class="o">.</span><span class="n">json</span><span class="p">(</span><span class="s2">"../data/employees.json"</span><span class="p">)</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+-----------------+-----------------+ |
| |Employee ID| Location| Role| |
| +-----------+-----------------+-----------------+ |
| | 19238| Seattle, WA| Data Analyst| |
| | 19239| Seattle, WA|Software Engineer| |
| | 19240| Seattle, WA| IT Specialist| |
| | 19241| New York, NY| Data Analyst| |
| | 19242|San Francisco, CA| Recruiter| |
| | 19243| New York, NY| Product Manager| |
| +-----------+-----------------+-----------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="From-an-existing-DataFrame"> |
| <h3>From an existing DataFrame<a class="headerlink" href="#From-an-existing-DataFrame" title="Permalink to this headline">#</a></h3> |
| <p>We can even create a DataFrame from another existing DataFrame, by selecting certain columns:</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">employees</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"John D."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">30</span><span class="p">,</span> <span class="s2">"department"</span><span class="p">:</span> <span class="s2">"HR"</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Alice G."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">25</span><span class="p">,</span> <span class="s2">"department"</span><span class="p">:</span> <span class="s2">"Finance"</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Bob T."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">35</span><span class="p">,</span> <span class="s2">"department"</span><span class="p">:</span> <span class="s2">"IT"</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Eve A."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">28</span><span class="p">,</span> <span class="s2">"department"</span><span class="p">:</span> <span class="s2">"Marketing"</span><span class="p">}</span> |
| <span class="p">]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">employees</span><span class="p">)</span> |
| |
| <span class="c1"># Select only the name and age columns</span> |
| <span class="n">new_df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"name"</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| </section> |
| <section id="From-a-table"> |
| <h3>From a table<a class="headerlink" href="#From-a-table" title="Permalink to this headline">#</a></h3> |
| <p>If you have an existing table <code class="docutils literal notranslate"><span class="pre">table_name</span></code> in your Spark environment, you can create a DataFrame like this:</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">"table_name"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| </section> |
| <section id="From-a-database"> |
| <h3>From a database<a class="headerlink" href="#From-a-database" title="Permalink to this headline">#</a></h3> |
| <p>If your table is in a database, you can use JDBC to read the table into a DataFrame.</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">url</span> <span class="o">=</span> <span class="s2">"jdbc:mysql://localhost:3306/mydatabase"</span> |
| <span class="n">table</span> <span class="o">=</span> <span class="s2">"employees"</span> |
| <span class="n">properties</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="s2">"user"</span><span class="p">:</span> <span class="s2">"username"</span><span class="p">,</span> |
| <span class="s2">"password"</span><span class="p">:</span> <span class="s2">"password"</span> |
| <span class="p">}</span> |
| |
| <span class="c1"># Read table into DataFrame</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">jdbc</span><span class="p">(</span><span class="n">url</span><span class="o">=</span><span class="n">url</span><span class="p">,</span> <span class="n">table</span><span class="o">=</span><span class="n">table</span><span class="p">,</span> <span class="n">properties</span><span class="o">=</span><span class="n">properties</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| </section> |
| </section> |
| <section id="View-the-DataFrame"> |
| <h2>View the DataFrame<a class="headerlink" href="#View-the-DataFrame" title="Permalink to this headline">#</a></h2> |
| <p>We can use PySpark to view and interact with our DataFrame.</p> |
| <section id="Display-the-DataFrame"> |
| <h3>Display the DataFrame<a class="headerlink" href="#Display-the-DataFrame" title="Permalink to this headline">#</a></h3> |
| <p><code class="docutils literal notranslate"><span class="pre">df.show()</span></code> displays a basic visualization of the DataFrame’s contents. From our above <code class="docutils literal notranslate"><span class="pre">createDataFrame()</span></code> example:</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">employees</span> <span class="o">=</span> <span class="p">[{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"John D."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">30</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Alice G."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">25</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Bob T."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">35</span><span class="p">},</span> |
| <span class="p">{</span><span class="s2">"name"</span><span class="p">:</span> <span class="s2">"Eve A."</span><span class="p">,</span> <span class="s2">"age"</span><span class="p">:</span> <span class="mi">28</span><span class="p">}]</span> |
| |
| <span class="c1"># Create a DataFrame containing the employees data</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">employees</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+--------+ |
| |age| name| |
| +---+--------+ |
| | 30| John D.| |
| | 25|Alice G.| |
| | 35| Bob T.| |
| | 28| Eve A.| |
| +---+--------+ |
| |
| </pre></div></div> |
| </div> |
| <p><code class="docutils literal notranslate"><span class="pre">df.show()</span></code> has 3 optional arguments: <code class="docutils literal notranslate"><span class="pre">n</span></code>, <code class="docutils literal notranslate"><span class="pre">truncate</span></code>, and <code class="docutils literal notranslate"><span class="pre">vertical</span></code>.</p> |
| <p>By default, <code class="docutils literal notranslate"><span class="pre">df.show()</span></code> displays up to the first 20 rows of the DataFrame. We can control the number of rows displayed by passing an argument to the show() method:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+--------+ |
| |age| name| |
| +---+--------+ |
| | 30| John D.| |
| | 25|Alice G.| |
| +---+--------+ |
| only showing top 2 rows |
| |
| </pre></div></div> |
| </div> |
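| <p>The <code class="docutils literal notranslate"><span class="pre">truncate</span></code> argument controls the maximum length of displayed column values. It defaults to <code class="docutils literal notranslate"><span class="pre">True</span></code>, which truncates values longer than 20 characters; passing an integer sets the maximum length explicitly:</p> |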
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+----+ |
| |age|name| |
| +---+----+ |
| | 30| Joh| |
| | 25| Ali| |
| | 35| Bob| |
| | 28| Eve| |
| +---+----+ |
| |
| </pre></div></div> |
| </div> |
| <p>If we set <code class="docutils literal notranslate"><span class="pre">vertical</span></code> to True, the DataFrame will be displayed vertically with one line per value:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">vertical</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| -RECORD 0-------- |
| age | 30 |
| name | John D. |
| -RECORD 1-------- |
| age | 25 |
| name | Alice G. |
| -RECORD 2-------- |
| age | 35 |
| name | Bob T. |
| -RECORD 3-------- |
| age | 28 |
| name | Eve A. |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Print-the-DataFrame-schema"> |
| <h3>Print the DataFrame schema<a class="headerlink" href="#Print-the-DataFrame-schema" title="Permalink to this headline">#</a></h3> |
| <p>We can view information about the DataFrame schema using the <code class="docutils literal notranslate"><span class="pre">printSchema()</span></code> method:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| root |
| |-- age: long (nullable = true) |
| |-- name: string (nullable = true) |
| |
| </pre></div></div> |
| </div> |
| </section> |
| </section> |
| <section id="DataFrame-Manipulation"> |
| <h2>DataFrame Manipulation<a class="headerlink" href="#DataFrame-Manipulation" title="Permalink to this headline">#</a></h2> |
| <p>Let’s look at some ways we can transform our DataFrames.</p> |
| <p>For more detailed information, please see the section about data manipulation, <a class="reference internal" href="dataprep.html">Chapter 3: Function Junction - Data manipulation with PySpark</a>.</p> |
| <section id="Rename-columns"> |
| <h3>Rename columns<a class="headerlink" href="#Rename-columns" title="Permalink to this headline">#</a></h3> |
| <p>We can rename DataFrame columns using the <code class="docutils literal notranslate"><span class="pre">withColumnRenamed()</span></code> method:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="n">df2</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s2">"name"</span><span class="p">,</span> <span class="s2">"full_name"</span><span class="p">)</span> |
| <span class="n">df2</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+--------+ |
| |age| name| |
| +---+--------+ |
| | 30| John D.| |
| | 25|Alice G.| |
| | 35| Bob T.| |
| | 28| Eve A.| |
| +---+--------+ |
| |
| +---+---------+ |
| |age|full_name| |
| +---+---------+ |
| | 30| John D.| |
| | 25| Alice G.| |
| | 35| Bob T.| |
| | 28| Eve A.| |
| +---+---------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Filter-rows"> |
| <h3>Filter rows<a class="headerlink" href="#Filter-rows" title="Permalink to this headline">#</a></h3> |
| <p>We can filter for employees within a certain age range. The following <code class="docutils literal notranslate"><span class="pre">df.filter</span></code> will create a new DataFrame with rows that match our age condition:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[17]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">filtered_df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">((</span><span class="n">df</span><span class="p">[</span><span class="s2">"age"</span><span class="p">]</span> <span class="o">></span> <span class="mi">26</span><span class="p">)</span> <span class="o">&</span> <span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">"age"</span><span class="p">]</span> <span class="o"><</span> <span class="mi">32</span><span class="p">))</span> |
| <span class="n">filtered_df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+-------+ |
| |age| name| |
| +---+-------+ |
| | 30|John D.| |
| | 28| Eve A.| |
| +---+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>We can also use <code class="docutils literal notranslate"><span class="pre">df.where</span></code> to get the same result:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[18]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">where_df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">where</span><span class="p">((</span><span class="n">df</span><span class="p">[</span><span class="s2">"age"</span><span class="p">]</span> <span class="o">></span> <span class="mi">26</span><span class="p">)</span> <span class="o">&</span> <span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">"age"</span><span class="p">]</span> <span class="o"><</span> <span class="mi">32</span><span class="p">))</span> |
| <span class="n">where_df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+-------+ |
| |age| name| |
| +---+-------+ |
| | 30|John D.| |
| | 28| Eve A.| |
| +---+-------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| </section> |
| <section id="DataFrames-vs.-Tables"> |
| <h2>DataFrames vs. Tables<a class="headerlink" href="#DataFrames-vs.-Tables" title="Permalink to this headline">#</a></h2> |
| <p>A DataFrame is an immutable distributed collection of data, only available in the current Spark session.</p> |
| <p>A table is a persistent data structure that can be accessed across multiple Spark sessions.</p> |
| <p>If you wish to promote a DataFrame to a table, you can use the <code class="docutils literal notranslate"><span class="pre">createOrReplaceTempView()</span></code> method:</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[19]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s2">"employees"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <p>Note that the lifetime of this temporary view is tied to the SparkSession that was used to create this DataFrame. To persist the data beyond this Spark session, you will need to save it to persistent storage.</p> |
| </section> |
| <section id="Save-DataFrame-to-Persistent-Storage"> |
| <h2>Save DataFrame to Persistent Storage<a class="headerlink" href="#Save-DataFrame-to-Persistent-Storage" title="Permalink to this headline">#</a></h2> |
| <p>There are several ways to save a DataFrame to persistent storage in PySpark. For more detailed information about saving data to your local environment, please see the section about data loading, <a class="reference internal" href="loadandbehold.html">Chapter 7: Load and Behold - Data loading, storage, file formats</a>.</p> |
| <section id="Save-to-file-based-data-source"> |
| <h3>Save to file-based data source<a class="headerlink" href="#Save-to-file-based-data-source" title="Permalink to this headline">#</a></h3> |
| <p>For file-based data sources (text, parquet, json, etc.), you can specify a custom table path like so:</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[20]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"path"</span><span class="p">,</span> <span class="s2">"../dataout"</span><span class="p">)</span><span class="o">.</span><span class="n">saveAsTable</span><span class="p">(</span><span class="s2">"dataframes_savetable_example"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <p>Even if the table is dropped, the custom table path and table data will still be there.</p> |
| <p>If no custom table path is specified, Spark will write data to a default table path under the warehouse directory. When the table is dropped, the default table path will be removed too.</p> |
| </section> |
| <section id="Save-to-Hive-metastore"> |
| <h3>Save to Hive metastore<a class="headerlink" href="#Save-to-Hive-metastore" title="Permalink to this headline">#</a></h3> |
| <p>To save to Hive metastore, you can use the following:</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[21]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="s2">"overwrite"</span><span class="p">)</span><span class="o">.</span><span class="n">saveAsTable</span><span class="p">(</span><span class="s2">"schemaName.tableName"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| </section> |
| </section> |
| <section id="Native-DataFrame-Plotting"> |
| <h2>Native DataFrame Plotting<a class="headerlink" href="#Native-DataFrame-Plotting" title="Permalink to this headline">#</a></h2> |
| <p>PySpark supports native plotting, allowing users to visualize data directly from PySpark DataFrames.</p> |
| <p>The user interacts with PySpark Plotting by calling the <code class="docutils literal notranslate"><span class="pre">plot</span></code> property on a PySpark DataFrame and specifying the desired type of plot, either as a submethod or by setting the <code class="docutils literal notranslate"><span class="pre">kind</span></code> parameter. For instance:</p> |
| <p><code class="docutils literal notranslate"><span class="pre">df.plot.line(x="category",</span> <span class="pre">y="int_val")</span></code></p> |
| <p>or equivalently:</p> |
| <p><code class="docutils literal notranslate"><span class="pre">df.plot(kind="line",</span> <span class="pre">x="category",</span> <span class="pre">y="int_val")</span></code></p> |
| <p>The feature is powered by <a class="reference external" href="https://plotly.com/python/">Plotly</a> as the default visualization backend, offering rich, interactive plotting capabilities, while native <a class="reference external" href="https://pandas.pydata.org/">pandas</a> is used internally to process data for most plots.</p> |
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="index.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">User Guide</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="touroftypes.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Chapter 2: A Tour of PySpark Data Types</p> |
| </div> |
| <i class="fa-solid fa-angle-right"></i> |
| </a> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Create-a-DataFrame">Create a DataFrame</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#From-a-list-of-dictionaries">From a list of dictionaries</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#From-a-local-file">From a local file</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#From-an-existing-DataFrame">From an existing DataFrame</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#From-a-table">From a table</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#From-a-database">From a database</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#View-the-DataFrame">View the DataFrame</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Display-the-DataFrame">Display the DataFrame</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Print-the-DataFrame-schema">Print the DataFrame schema</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#DataFrame-Manipulation">DataFrame Manipulation</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Rename-columns">Rename columns</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Filter-rows">Filter rows</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#DataFrames-vs.-Tables">DataFrames vs. Tables</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Save-DataFrame-to-Persistent-Storage">Save DataFrame to Persistent Storage</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Save-to-file-based-data-source">Save to file-based data source</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Save-to-Hive-metastore">Save to Hive metastore</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Native-DataFrame-Plotting">Native DataFrame Plotting</a></li> |
| </ul> |
| </nav></div> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="tocsection sourcelink"> |
| <a href="../_sources/user_guide/dataframes.ipynb.txt"> |
| <i class="fa-solid fa-file-lines"></i> Show Source |
| </a> |
| </div> |
| </div> |
| |
| </div></div> |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2025 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |