| |
| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en"> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Chapter 7: Load and Behold - Data loading, storage, file formats — PySpark 4.1.0-preview1 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"; |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/nbsphinx-code-cells.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/clipboard.min.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script> |
| <script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = 'user_guide/loadandbehold';</script> |
| <script> |
| DOCUMENTATION_OPTIONS.theme_switcher_json_url = 'https://spark.apache.org/static/versions.json'; |
| DOCUMENTATION_OPTIONS.theme_switcher_version_match = '4.1.0-preview1'; |
| </script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/user_guide/loadandbehold.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="API Reference" href="../reference/index.html" /> |
| <link rel="prev" title="Chapter 6: Old SQL, New Tricks - Running SQL on PySpark" href="sql.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../tutorial/index.html"> |
| Tutorials |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guide |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <div class="nav-item dropdown"> |
| <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| More |
| </button> |
| <div class="dropdown-menu"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </div> |
| </div> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <div class="version-switcher__container dropdown"> |
| <button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown"> |
| 4.1.0-preview1 <!-- this text may get changed later by javascript --> |
| <span class="caret"></span> |
| </button> |
| <div class="version-switcher__menu dropdown-menu list-group-flush py-0"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| <label class="sidebar-toggle secondary-toggle" for="__secondary"> |
| <span class="fa-solid fa-outdent"></span> |
| </label> |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../tutorial/index.html"> |
| Tutorials |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guide |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <div class="nav-item dropdown"> |
| <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| More |
| </button> |
| <div class="dropdown-menu"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </div> |
| </div> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <div class="version-switcher__container dropdown"> |
| <button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown"> |
| 4.1.0-preview1 <!-- this text may get changed later by javascript --> |
| <span class="caret"></span> |
| </button> |
| <div class="version-switcher__menu dropdown-menu list-group-flush py-0"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links" |
| aria-label="Section Navigation"> |
| <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p> |
| <div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference internal" href="dataframes.html">Chapter 1: DataFrames - A view into your structured data</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="touroftypes.html">Chapter 2: A Tour of PySpark Data Types</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="dataprep.html">Chapter 3: Function Junction - Data manipulation with PySpark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="bugbusting.html">Chapter 4: Bug Busting - Debugging PySpark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="udfandudtf.html">Chapter 5: Unleashing UDFs &amp; UDTFs</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="sql.html">Chapter 6: Old SQL, New Tricks - Running SQL on PySpark</a></li> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Chapter 7: Load and Behold - Data loading, storage, file formats</a></li> |
| </ul> |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="index.html" class="nav-link">User Guide</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">Chapter 7: Load and Behold - Data loading, storage, file formats</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <section id="Chapter-7:-Load-and-Behold---Data-loading,-storage,-file-formats"> |
| <h1>Chapter 7: Load and Behold - Data loading, storage, file formats<a class="headerlink" href="#Chapter-7:-Load-and-Behold---Data-loading,-storage,-file-formats" title="Permalink to this headline">#</a></h1> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="o">!</span>pip<span class="w"> </span>install<span class="w"> </span><span class="nv">pyspark</span><span class="o">==</span><span class="m">4</span>.0.0.dev2 |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| Requirement already satisfied: pyspark==4.0.0.dev2 in /Users/amanda.liu/anaconda3/envs/llm-spark/lib/python3.11/site-packages (4.0.0.dev2) |
| Requirement already satisfied: py4j==0.10.9.7 in /Users/amanda.liu/anaconda3/envs/llm-spark/lib/python3.11/site-packages (from pyspark==4.0.0.dev2) (0.10.9.7) |
| </pre></div></div> |
| </div> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span> \ |
| <span class="o">.</span><span class="n">builder</span> \ |
| <span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"Data Loading and Storage Example"</span><span class="p">)</span> \ |
| <span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <p>This section covers how to read and write data in various formats using PySpark. You’ll learn how to load data from common file types (e.g., CSV, JSON, Parquet, ORC) and store data efficiently.</p> |
| <section id="Reading-Data"> |
| <h2>Reading Data<a class="headerlink" href="#Reading-Data" title="Permalink to this headline">#</a></h2> |
| <section id="1.1-Reading-CSV-Files"> |
| <h3>1.1 Reading CSV Files<a class="headerlink" href="#1.1-Reading-CSV-Files" title="Permalink to this headline">#</a></h3> |
| <p>CSV is one of the most common formats for data exchange. Here’s how to load a CSV file into a DataFrame:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">csv_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s2">"../data/employees.csv"</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">inferSchema</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">csv_df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+-----------------+-----------------+ |
| |Employee ID| Role| Location| |
| +-----------+-----------------+-----------------+ |
| | 19238| Data Analyst| Seattle, WA| |
| | 19239|Software Engineer| Seattle, WA| |
| | 19240| IT Specialist| Seattle, WA| |
| | 19241| Data Analyst| New York, NY| |
| | 19242| Recruiter|San Francisco, CA| |
| | 19243| Product Manager| New York, NY| |
| +-----------+-----------------+-----------------+ |
| |
| </pre></div></div> |
| </div> |
| <p><strong>Explanation:</strong></p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">header=True</span></code>: Treats the first line as column names.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">inferSchema=True</span></code>: Automatically infers data types of columns.</p></li> |
| </ul> |
| </section> |
| <section id="1.2-Reading-JSON-Files"> |
| <h3>1.2 Reading JSON Files<a class="headerlink" href="#1.2-Reading-JSON-Files" title="Permalink to this headline">#</a></h3> |
| <p>Loading JSON files is simple and allows you to handle both single-line and multi-line JSON structures:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">json_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"multiline"</span><span class="p">,</span> <span class="s2">"true"</span><span class="p">)</span><span class="o">.</span><span class="n">json</span><span class="p">(</span><span class="s2">"../data/employees.json"</span><span class="p">)</span> |
| <span class="n">json_df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+-----------------+-----------------+ |
| |Employee ID| Location| Role| |
| +-----------+-----------------+-----------------+ |
| | 19238| Seattle, WA| Data Analyst| |
| | 19239| Seattle, WA|Software Engineer| |
| | 19240| Seattle, WA| IT Specialist| |
| | 19241| New York, NY| Data Analyst| |
| | 19242|San Francisco, CA| Recruiter| |
| | 19243| New York, NY| Product Manager| |
| +-----------+-----------------+-----------------+ |
| |
| </pre></div></div> |
| </div> |
| <p><strong>Explanation:</strong></p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">multiline="true"</span></code>: Allows reading multi-line JSON structures.</p></li> |
| </ul> |
| </section> |
| <section id="1.3-Reading-Parquet-Files"> |
| <h3>1.3 Reading Parquet Files<a class="headerlink" href="#1.3-Reading-Parquet-Files" title="Permalink to this headline">#</a></h3> |
| <p>Parquet is a columnar format that supports efficient data compression and encoding:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">parquet_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s2">"../data/employees.parquet"</span><span class="p">)</span> |
| <span class="n">parquet_df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+-----------------+-----------------+ |
| |Employee ID| Location| Role| |
| +-----------+-----------------+-----------------+ |
| | 19239| Seattle, WA|Software Engineer| |
| | 19243| New York, NY| Product Manager| |
| | 19242|San Francisco, CA| Recruiter| |
| | 19241| New York, NY| Data Analyst| |
| | 19240| Seattle, WA| IT Specialist| |
| | 19238| Seattle, WA| Data Analyst| |
| +-----------+-----------------+-----------------+ |
| |
| </pre></div></div> |
| </div> |
| <p><strong>Tip:</strong> Parquet files are highly efficient for storing data due to columnar storage and compression.</p> |
| </section> |
| <section id="1.4-Reading-ORC-Files"> |
| <h3>1.4 Reading ORC Files<a class="headerlink" href="#1.4-Reading-ORC-Files" title="Permalink to this headline">#</a></h3> |
| <p>ORC is another columnar file format, often used in Hadoop environments:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">orc_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s2">"../data/employees.orc"</span><span class="p">)</span> |
| <span class="n">orc_df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+-----------------+-----------------+ |
| |Employee ID| Location| Role| |
| +-----------+-----------------+-----------------+ |
| | 19242|San Francisco, CA| Recruiter| |
| | 19239| Seattle, WA|Software Engineer| |
| | 19240| Seattle, WA| IT Specialist| |
| | 19243| New York, NY| Product Manager| |
| | 19238| Seattle, WA| Data Analyst| |
| | 19241| New York, NY| Data Analyst| |
| +-----------+-----------------+-----------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| </section> |
| <section id="Writing-Data"> |
| <h2>Writing Data<a class="headerlink" href="#Writing-Data" title="Permalink to this headline">#</a></h2> |
| <section id="2.1-Writing-Data-as-CSV"> |
| <h3>2.1 Writing Data as CSV<a class="headerlink" href="#2.1-Writing-Data-as-CSV" title="Permalink to this headline">#</a></h3> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">csv_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s2">"../data/employees_out.csv"</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <p><strong>Explanation:</strong></p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">mode="overwrite"</span></code>: If the directory exists, it will be replaced.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">header=True</span></code>: Writes the column names as the first line.</p></li> |
| </ul> |
| </section> |
| <section id="2.2-Writing-Data-as-Parquet"> |
| <h3>2.2 Writing Data as Parquet<a class="headerlink" href="#2.2-Writing-Data-as-Parquet" title="Permalink to this headline">#</a></h3> |
| <p>Parquet format is recommended for large datasets:</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">parquet_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s2">"../data/employees_out.parquet"</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| </section> |
| <section id="2.3-Writing-Data-as-ORC"> |
| <h3>2.3 Writing Data as ORC<a class="headerlink" href="#2.3-Writing-Data-as-ORC" title="Permalink to this headline">#</a></h3> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">json_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s2">"../data/employees_out.orc"</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <p><strong>Tip:</strong> Parquet and ORC formats are best for efficient storage and quick reads.</p> |
| </section> |
| </section> |
| <section id="Additional-Options-and-Configurations"> |
| <h2>Additional Options and Configurations<a class="headerlink" href="#Additional-Options-and-Configurations" title="Permalink to this headline">#</a></h2> |
| <p>You can customize how data is read and written by using additional options. Here are a few examples:</p> |
| <section id="Custom-Delimiter-in-CSV:"> |
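| <h3>Custom Delimiter in CSV:<a class="headerlink" href="#Custom-Delimiter-in-CSV:" title="Permalink to this headline">#</a></h3> |
| <p>Use the <code class="docutils literal notranslate"><span class="pre">delimiter</span></code> option when fields are separated by a character other than a comma. Because <code class="docutils literal notranslate"><span class="pre">employees.csv</span></code> is comma-separated, reading it with <code class="docutils literal notranslate"><span class="pre">;</span></code> as the delimiter puts each whole line into a single column:</p> |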
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"delimiter"</span><span class="p">,</span> <span class="s2">";"</span><span class="p">)</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s2">"../data/employees.csv"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------------------------------------+ |
| |_c0 | |
| +-------------------------------------+ |
| |Employee ID,Role,Location | |
| |19238,Data Analyst,"Seattle, WA" | |
| |19239,Software Engineer,"Seattle, WA"| |
| |19240,IT Specialist,"Seattle, WA" | |
| |19241,Data Analyst,"New York, NY" | |
| |19242,Recruiter,"San Francisco, CA" | |
| |19243,Product Manager,"New York, NY" | |
| +-------------------------------------+ |
| |
| </pre></div></div> |
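| </div> |
| <p><strong>Note:</strong> <code class="docutils literal notranslate"><span class="pre">employees.csv</span></code> is comma-separated, so reading it with <code class="docutils literal notranslate"><span class="pre">;</span></code> as the delimiter leaves every line unsplit in a single column (<code class="docutils literal notranslate"><span class="pre">_c0</span></code>). Set <code class="docutils literal notranslate"><span class="pre">delimiter</span></code> to the character that actually separates the fields in your file.</p> |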
| </section> |
| <section id="Handling-Null-Values:"> |
| <h3>Handling Null Values:<a class="headerlink" href="#Handling-Null-Values:" title="Permalink to this headline">#</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"nullValue"</span><span class="p">,</span> <span class="s2">"NULL"</span><span class="p">)</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s2">"../data/employees.csv"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+-----------------+-----------------+ |
| |_c0 |_c1 |_c2 | |
| +-----------+-----------------+-----------------+ |
| |Employee ID|Role |Location | |
| |19238 |Data Analyst |Seattle, WA | |
| |19239 |Software Engineer|Seattle, WA | |
| |19240 |IT Specialist |Seattle, WA | |
| |19241 |Data Analyst |New York, NY | |
| |19242 |Recruiter |San Francisco, CA| |
| |19243 |Product Manager |New York, NY | |
| +-----------+-----------------+-----------------+ |
| |
| </pre></div></div> |
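| </div> |
| <p><strong>Note:</strong> The sample file contains no literal <code class="docutils literal notranslate"><span class="pre">NULL</span></code> strings, so no values are replaced in this output. With <code class="docutils literal notranslate"><span class="pre">nullValue</span></code> set, any field whose text is exactly <code class="docutils literal notranslate"><span class="pre">NULL</span></code> is read as a null value.</p> |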
| </section> |
| <section id="Compression-Options:"> |
| <h3>Compression Options:<a class="headerlink" href="#Compression-Options:" title="Permalink to this headline">#</a></h3> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">parquet_df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"compression"</span><span class="p">,</span> <span class="s2">"gzip"</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s2">"../data/employees_out.parquet"</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"overwrite"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <p>See the <a class="reference external" href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/io.html">PySpark API reference</a> for Input/Output to check all supported functions and options.</p> |
| </section> |
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next chapter links. The arrow icons are decorative and hidden from assistive technology because each link already carries visible text. --> |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="sql.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left" aria-hidden="true"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">Chapter 6: Old SQL, New Tricks - Running SQL on PySpark</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="../reference/index.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">API Reference</p> |
| </div> |
| <i class="fa-solid fa-angle-right" aria-hidden="true"></i> |
| </a> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| <!-- Secondary sidebar: in-page table of contents followed by a link to the notebook source. The nav landmark is labelled so it can be told apart from the page's other navigation regions; icons are decorative. --> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list" aria-hidden="true"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc" aria-label="On this page"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Reading-Data">Reading Data</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#1.1-Reading-CSV-Files">1.1 Reading CSV Files</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#1.2-Reading-JSON-Files">1.2 Reading JSON Files</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#1.3-Reading-Parquet-Files">1.3 Reading Parquet Files</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#1.4-Reading-ORC-Files">1.4 Reading ORC Files</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Writing-Data">Writing Data</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#2.1-Writing-Data-as-CSV">2.1 Writing Data as CSV</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#2.2-Writing-Data-as-Parquet">2.2 Writing Data as Parquet</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#2.3-Writing-Data-as-ORC">2.3 Writing Data as ORC</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Additional-Options-and-Configurations">Additional Options and Configurations</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Custom-Delimiter-in-CSV:">Custom Delimiter in CSV:</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Handling-Null-Values:">Handling Null Values:</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Compression-Options:">Compression Options:</a></li> |
| </ul> |
| </li> |
| </ul> |
| </nav></div> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="tocsection sourcelink"> |
| <a href="../_sources/user_guide/loadandbehold.ipynb.txt"> |
| <i class="fa-solid fa-file-lines" aria-hidden="true"></i> Show Source |
| </a> |
| </div> |
| </div> |
| |
| </div></div> |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Theme scripts (preloaded in <head>) are executed here, inside <body> after the main content, so they do not block parsing of the page content above --> |
| <script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <!-- Site footer: copyright and license notice on the start side, build tool versions on the start and end sides --> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2025 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |