blob: a72af6736b831f4ce60b92e51b1facb75cb49983 [file] [log] [blame]
<!DOCTYPE html>
<html >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
<title>Chapter 6: Old SQL, New Tricks - Running SQL on PySpark &#8212; PySpark 4.1.0-preview1 documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../_static/nbsphinx-code-cells.css" />
<link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script>
<script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'user_guide/sql';</script>
<script>
DOCUMENTATION_OPTIONS.theme_switcher_json_url = 'https://spark.apache.org/static/versions.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '4.1.0-preview1';
</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/user_guide/sql.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Chapter 7: Load and Behold - Data loading, storage, file formats" href="loadandbehold.html" />
<link rel="prev" title="Chapter 5: Unleashing UDFs &amp; UDTFs" href="udfandudtf.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="Logo image"/>
<script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../tutorial/index.html">
Tutorials
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
User Guide
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../development/index.html">
Development
</a>
</li>
<div class="nav-item dropdown">
<button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
More
</button>
<div class="dropdown-menu">
<li class="nav-item">
<a class="nav-link nav-internal" href="../migration_guide/index.html">
Migration Guides
</a>
</li>
</div>
</div>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item">
<script>
document.write(`
<div class="version-switcher__container dropdown">
<button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown">
4.1.0-preview1 <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div class="version-switcher__menu dropdown-menu list-group-flush py-0">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`);
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../tutorial/index.html">
Tutorials
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
User Guide
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../development/index.html">
Development
</a>
</li>
<div class="nav-item dropdown">
<button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
More
</button>
<div class="dropdown-menu">
<li class="nav-item">
<a class="nav-link nav-internal" href="../migration_guide/index.html">
Migration Guides
</a>
</li>
</div>
</div>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<script>
document.write(`
<div class="version-switcher__container dropdown">
<button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown">
4.1.0-preview1 <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div class="version-switcher__menu dropdown-menu list-group-flush py-0">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`);
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="dataframes.html">Chapter 1: DataFrames - A view into your structured data</a></li>
<li class="toctree-l1"><a class="reference internal" href="touroftypes.html">Chapter 2: A Tour of PySpark Data Types</a></li>
<li class="toctree-l1"><a class="reference internal" href="dataprep.html">Chapter 3: Function Junction - Data manipulation with PySpark</a></li>
<li class="toctree-l1"><a class="reference internal" href="bugbusting.html">Chapter 4: Bug Busting - Debugging PySpark</a></li>
<li class="toctree-l1"><a class="reference internal" href="udfandudtf.html">Chapter 5: Unleashing UDFs &amp; UDTFs</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Chapter 6: Old SQL, New Tricks - Running SQL on PySpark</a></li>
<li class="toctree-l1"><a class="reference internal" href="loadandbehold.html">Chapter 7: Load and Behold - Data loading, storage, file formats</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="index.html" class="nav-link">User Guide</a></li>
<li class="breadcrumb-item active" aria-current="page">Chapter 6: Old SQL, New Tricks - Running SQL on PySpark</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">warnings</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">filterwarnings</span><span class="p">(</span><span class="s1">&#39;ignore&#39;</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">setLogLevel</span><span class="p">(</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<section id="Chapter-6:-Old-SQL,-New-Tricks---Running-SQL-on-PySpark">
<h1>Chapter 6: Old SQL, New Tricks - Running SQL on PySpark<a class="headerlink" href="#Chapter-6:-Old-SQL,-New-Tricks---Running-SQL-on-PySpark" title="Permalink to this headline">#</a></h1>
<section id="Introduction">
<h2>Introduction<a class="headerlink" href="#Introduction" title="Permalink to this headline">#</a></h2>
<p>This section explains how to use the Spark SQL API in PySpark and compare it with the DataFrame API. It also covers how to switch between the two APIs seamlessly, along with some practical tips and tricks.</p>
</section>
<section id="Running-SQL-with-PySpark">
<h2>Running SQL with PySpark<a class="headerlink" href="#Running-SQL-with-PySpark" title="Permalink to this headline">#</a></h2>
<p>PySpark offers two main ways to perform SQL operations:</p>
<section id="Using-spark.sql()">
<h3>Using <code class="docutils literal notranslate"><span class="pre">spark.sql()</span></code><a class="headerlink" href="#Using-spark.sql()" title="Permalink to this headline">#</a></h3>
<p>The <code class="docutils literal notranslate"><span class="pre">spark.sql()</span></code> function allows you to execute SQL queries directly.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Create a table via spark.sql()</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;DROP TABLE IF EXISTS people&quot;</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2">CREATE TABLE people USING PARQUET</span>
<span class="s2">AS SELECT * FROM VALUES (1, &#39;Alice&#39;, 10), (2, &#39;Bob&#39;, 20), (3, &#39;Charlie&#39;, 30) t(id, name, age)</span>
<span class="s2">&quot;&quot;&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]:
</pre></div>
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
DataFrame[]
</pre></div></div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Use spark.sql() to select data from a table</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT name, age FROM people WHERE age &gt; 21&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+---+
| name|age|
+-------+---+
|Charlie| 30|
+-------+---+
</pre></div></div>
</div>
</section>
<section id="Using-the-PySpark-DataFrame-API">
<h3>Using the PySpark DataFrame API<a class="headerlink" href="#Using-the-PySpark-DataFrame-API" title="Permalink to this headline">#</a></h3>
<p>The PySpark DataFrame API provides equivalent functionality to SQL but with a Pythonic approach.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Read a table using the DataFrame API</span>
<span class="n">people_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;people&quot;</span><span class="p">)</span>
<span class="c1"># Use DataFrame API to select data</span>
<span class="n">people_df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">,</span> <span class="s2">&quot;age&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;age &gt; 21&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+---+
| name|age|
+-------+---+
|Charlie| 30|
+-------+---+
</pre></div></div>
</div>
</section>
</section>
<section id="SQL-vs.-DataFrame-API-in-PySpark">
<h2>SQL vs. DataFrame API in PySpark<a class="headerlink" href="#SQL-vs.-DataFrame-API-in-PySpark" title="Permalink to this headline">#</a></h2>
<p>When to use which API depends on your background and the specific task:</p>
<p><strong>SQL API:</strong> - Ideal for users with SQL backgrounds who are more comfortable writing SQL queries.</p>
<p><strong>DataFrame API:</strong> - Preferred by Python developers as it aligns with Python syntax and idioms. - Provides greater flexibility for complex transformations, especially with user-defined functions (UDFs).</p>
<section id="Code-Examples:-SQL-vs.-DataFrame-API">
<h3>Code Examples: SQL vs. DataFrame API<a class="headerlink" href="#Code-Examples:-SQL-vs.-DataFrame-API" title="Permalink to this headline">#</a></h3>
<p>Here are some examples comparing how common tasks are performed using the SQL API and PySpark’s DataFrame API to give you an idea of their differences and when one might be more suitable than the other.</p>
<section id="Example:-SELECT-and-FILTER-Operation">
<h4>Example: SELECT and FILTER Operation<a class="headerlink" href="#Example:-SELECT-and-FILTER-Operation" title="Permalink to this headline">#</a></h4>
<p><strong>SQL API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT name FROM people WHERE age &gt; 21&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+
| name|
+-------+
|Charlie|
+-------+
</pre></div></div>
</div>
<p><strong>DataFrame API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;people&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;age &gt; 21&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+
| name|
+-------+
|Charlie|
+-------+
</pre></div></div>
</div>
</section>
<section id="Example:-JOIN-Operation">
<h4>Example: JOIN Operation<a class="headerlink" href="#Example:-JOIN-Operation" title="Permalink to this headline">#</a></h4>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[18]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;DROP TABLE IF EXISTS orders&quot;</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2">CREATE TABLE orders USING PARQUET</span>
<span class="s2">AS SELECT * FROM VALUES (101, 1, 200), (102, 2, 150), (103,3, 300) t(order_id, customer_id, amount)</span>
<span class="s2">&quot;&quot;&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[18]:
</pre></div>
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
DataFrame[]
</pre></div></div>
</div>
<p><strong>SQL API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[19]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2">SELECT p.name, o.order_id</span>
<span class="s2">FROM people p</span>
<span class="s2">JOIN orders o ON p.id = o.customer_id</span>
<span class="s2">&quot;&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+--------+
| name|order_id|
+-------+--------+
|Charlie| 103|
| Alice| 101|
| Bob| 102|
+-------+--------+
</pre></div></div>
</div>
<p><strong>DataFrame API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[20]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">people_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;people&quot;</span><span class="p">)</span>
<span class="n">orders_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;orders&quot;</span><span class="p">)</span>
<span class="p">(</span>
<span class="n">people_df</span>
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">orders_df</span><span class="p">,</span> <span class="n">people_df</span><span class="o">.</span><span class="n">id</span> <span class="o">==</span> <span class="n">orders_df</span><span class="o">.</span><span class="n">customer_id</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">people_df</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">orders_df</span><span class="o">.</span><span class="n">order_id</span><span class="p">)</span>
<span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+--------+
| name|order_id|
+-------+--------+
|Charlie| 103|
| Alice| 101|
| Bob| 102|
+-------+--------+
</pre></div></div>
</div>
</section>
<section id="Example:-GROUP-BY-and-Aggregate-Operation">
<h4>Example: GROUP BY and Aggregate Operation<a class="headerlink" href="#Example:-GROUP-BY-and-Aggregate-Operation" title="Permalink to this headline">#</a></h4>
<p><strong>SQL API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[21]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2">SELECT p.name, SUM(o.amount) AS total_amount</span>
<span class="s2">FROM people p</span>
<span class="s2">JOIN orders o ON p.id = o.customer_id</span>
<span class="s2">GROUP BY p.name</span>
<span class="s2">&quot;&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+------------+
| name|total_amount|
+-------+------------+
|Charlie| 300|
| Alice| 200|
| Bob| 150|
+-------+------------+
</pre></div></div>
</div>
<p><strong>DataFrame API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[22]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="nb">sum</span>
<span class="p">(</span>
<span class="n">people_df</span>
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">orders_df</span><span class="p">,</span> <span class="n">people_df</span><span class="o">.</span><span class="n">id</span> <span class="o">==</span> <span class="n">orders_df</span><span class="o">.</span><span class="n">customer_id</span><span class="p">)</span>
<span class="o">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="s2">&quot;amount&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;total_amount&quot;</span><span class="p">))</span>
<span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+------------+
| name|total_amount|
+-------+------------+
|Charlie| 300|
| Alice| 200|
| Bob| 150|
+-------+------------+
</pre></div></div>
</div>
</section>
<section id="Example:-Window-Operations">
<h4>Example: Window Operations<a class="headerlink" href="#Example:-Window-Operations" title="Permalink to this headline">#</a></h4>
<p><strong>SQL API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[23]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2">SELECT</span>
<span class="s2"> p.name,</span>
<span class="s2"> o.amount,</span>
<span class="s2"> RANK() OVER (PARTITION BY p.name ORDER BY o.amount DESC) AS rank</span>
<span class="s2">FROM people p</span>
<span class="s2">JOIN orders o ON p.id = o.customer_id</span>
<span class="s2">&quot;&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+------+----+
| name|amount|rank|
+-------+------+----+
| Alice| 200| 1|
| Bob| 150| 1|
|Charlie| 300| 1|
+-------+------+----+
</pre></div></div>
</div>
<p><strong>DataFrame API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[24]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.window</span><span class="w"> </span><span class="kn">import</span> <span class="n">Window</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">rank</span>
<span class="c1"># Define the window specification</span>
<span class="n">window_spec</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">orders_df</span><span class="o">.</span><span class="n">amount</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span>
<span class="c1"># Window operation with RANK</span>
<span class="p">(</span>
<span class="n">people_df</span>
<span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">orders_df</span><span class="p">,</span> <span class="n">people_df</span><span class="o">.</span><span class="n">id</span> <span class="o">==</span> <span class="n">orders_df</span><span class="o">.</span><span class="n">customer_id</span><span class="p">)</span>
<span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;rank&quot;</span><span class="p">,</span> <span class="n">rank</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window_spec</span><span class="p">))</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">,</span> <span class="s2">&quot;amount&quot;</span><span class="p">,</span> <span class="s2">&quot;rank&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+------+----+
| name|amount|rank|
+-------+------+----+
| Alice| 200| 1|
| Bob| 150| 1|
|Charlie| 300| 1|
+-------+------+----+
</pre></div></div>
</div>
</section>
<section id="Example:-UNION-Operation">
<h4>Example: UNION Operation<a class="headerlink" href="#Example:-UNION-Operation" title="Permalink to this headline">#</a></h4>
<p><strong>SQL API:</strong> - The <code class="docutils literal notranslate"><span class="pre">UNION</span></code> operator combines rows from two queries and removes duplicates by default.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[25]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;CREATE OR REPLACE TEMP VIEW people2 AS SELECT * FROM VALUES (1, &#39;Alice&#39;, 10), (4, &#39;David&#39;, 35) t(id, name, age)&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[25]:
</pre></div>
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
DataFrame[]
</pre></div></div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[26]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2">SELECT * FROM people</span>
<span class="s2">UNION</span>
<span class="s2">SELECT * FROM people2</span>
<span class="s2">&quot;&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+---+-------+---+
| id| name|age|
+---+-------+---+
| 3|Charlie| 30|
| 1| Alice| 10|
| 2| Bob| 20|
| 4| David| 35|
+---+-------+---+
</pre></div></div>
</div>
<p><strong>DataFrame API:</strong> - The <code class="docutils literal notranslate"><span class="pre">union()</span></code> method is used to combine two DataFrames, but it does not remove duplicates by default. - To match the behavior of SQL’s UNION, we use the .dropDuplicates() method to eliminate duplicates after the union operation.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[27]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">people_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;people&quot;</span><span class="p">)</span>
<span class="n">people2_df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;people2&quot;</span><span class="p">)</span>
<span class="c1"># This will have duplicate values.</span>
<span class="n">people_df</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">people2_df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+---+-------+---+
| id| name|age|
+---+-------+---+
| 3|Charlie| 30|
| 1| Alice| 10|
| 2| Bob| 20|
| 1| Alice| 10|
| 4| David| 35|
+---+-------+---+
</pre></div></div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[28]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Remove duplicate values</span>
<span class="n">people_df</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">people2_df</span><span class="p">)</span><span class="o">.</span><span class="n">dropDuplicates</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+---+-------+---+
| id| name|age|
+---+-------+---+
| 3|Charlie| 30|
| 1| Alice| 10|
| 2| Bob| 20|
| 4| David| 35|
+---+-------+---+
</pre></div></div>
</div>
</section>
<section id="Example:-SET-Configurations">
<h4>Example: SET Configurations<a class="headerlink" href="#Example:-SET-Configurations" title="Permalink to this headline">#</a></h4>
<p><strong>SQL API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[29]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SET spark.sql.shuffle.partitions=8&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[29]:
</pre></div>
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
DataFrame[key: string, value: string]
</pre></div></div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[30]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SET spark.sql.shuffle.partitions&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+----------------------------+-----+
|key |value|
+----------------------------+-----+
|spark.sql.shuffle.partitions|8 |
+----------------------------+-----+
</pre></div></div>
</div>
<p><strong>DataFrame API:</strong></p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[31]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;spark.sql.shuffle.partitions&quot;</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[32]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.sql.shuffle.partitions&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[32]:
</pre></div>
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
&#39;10&#39;
</pre></div></div>
</div>
</section>
<section id="Example:-Listing-Tables-and-Views">
<h4>Example: Listing Tables and Views<a class="headerlink" href="#Example:-Listing-Tables-and-Views" title="Permalink to this headline">#</a></h4>
<p><strong>SQL API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[33]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SHOW TABLES&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
| default| orders| false|
| default| people| false|
| | people2| true|
+---------+---------+-----------+
</pre></div></div>
</div>
<p><strong>DataFrame API:</strong></p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[34]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">tables</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">catalog</span><span class="o">.</span><span class="n">listTables</span><span class="p">()</span>
<span class="k">for</span> <span class="n">table</span> <span class="ow">in</span> <span class="n">tables</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Name: </span><span class="si">{</span><span class="n">table</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">, isTemporary: </span><span class="si">{</span><span class="n">table</span><span class="o">.</span><span class="n">isTemporary</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Name: orders, isTemporary: False
Name: people, isTemporary: False
Name: people2, isTemporary: True
</pre></div></div>
</div>
</section>
</section>
<section id="DataFrame-API-Exclusive-Functions">
<h3>DataFrame API Exclusive Functions<a class="headerlink" href="#DataFrame-API-Exclusive-Functions" title="Permalink to this headline">#</a></h3>
<p>Certain operations are exclusive to the DataFrame API and are not supported in SQL, such as:</p>
<p><strong>withColumn</strong>: Adds or modifies columns in a DataFrame.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[35]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">people_df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;new_col&quot;</span><span class="p">,</span> <span class="n">people_df</span><span class="p">[</span><span class="s2">&quot;age&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+---+-------+---+-------+
| id| name|age|new_col|
+---+-------+---+-------+
| 3|Charlie| 30| 40|
| 1| Alice| 10| 20|
| 2| Bob| 20| 30|
+---+-------+---+-------+
</pre></div></div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[39]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">people_df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;age&quot;</span><span class="p">,</span> <span class="n">people_df</span><span class="p">[</span><span class="s2">&quot;age&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+---+-------+---+
| id| name|age|
+---+-------+---+
| 3|Charlie| 40|
| 1| Alice| 20|
| 2| Bob| 30|
+---+-------+---+
</pre></div></div>
</div>
</section>
</section>
<section id="Using-SQL-and-DataFrame-API-Interchangeably">
<h2>Using SQL and DataFrame API Interchangeably<a class="headerlink" href="#Using-SQL-and-DataFrame-API-Interchangeably" title="Permalink to this headline">#</a></h2>
<p>PySpark supports switching between SQL and DataFrame API, making it easy to mix and match.</p>
<section id="Chaining-DataFrame-Operations-on-SQL-Outputs">
<h3>Chaining DataFrame Operations on SQL Outputs<a class="headerlink" href="#Chaining-DataFrame-Operations-on-SQL-Outputs" title="Permalink to this headline">#</a></h3>
<p>PySpark’s DataFrame API allows you to chain multiple operations together to create efficient and readable transformations.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[36]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Chaining DataFrame operations on SQL results</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT name, age FROM people&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">&quot;age &gt; 21&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+---+
| name|age|
+-------+---+
|Charlie| 30|
+-------+---+
</pre></div></div>
</div>
</section>
<section id="Using-selectExpr()">
<h3>Using <code class="docutils literal notranslate"><span class="pre">selectExpr()</span></code><a class="headerlink" href="#Using-selectExpr()" title="Permalink to this headline">#</a></h3>
<p>The <code class="docutils literal notranslate"><span class="pre">selectExpr()</span></code> method allows you to run SQL expressions within the DataFrame API.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[37]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">people_df</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">,</span> <span class="s2">&quot;age + 1 AS age_plus_one&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+------------+
| name|age_plus_one|
+-------+------------+
|Charlie| 31|
| Alice| 11|
| Bob| 21|
+-------+------------+
</pre></div></div>
</div>
</section>
<section id="Querying-a-DataFrame-in-SQL">
<h3>Querying a DataFrame in SQL<a class="headerlink" href="#Querying-a-DataFrame-in-SQL" title="Permalink to this headline">#</a></h3>
<p>You can create a temporary view from a DataFrame and run SQL queries on it.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[38]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># First create a temp view on top of the DataFrame.</span>
<span class="n">people_df</span><span class="o">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s2">&quot;people_view&quot;</span><span class="p">)</span>
<span class="c1"># Then it can be referenced in SQL.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT * FROM people_view WHERE age &gt; 21&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+---+-------+---+
| id| name|age|
+---+-------+---+
| 3|Charlie| 30|
+---+-------+---+
</pre></div></div>
</div>
</section>
<section id="Use-Python-User-Defined-Functions-in-SQL">
<h3>Use Python User-Defined Functions in SQL<a class="headerlink" href="#Use-Python-User-Defined-Functions-in-SQL" title="Permalink to this headline">#</a></h3>
<p>You can register Python user-defined functions (UDFs) for use within SQL queries, enabling custom transformations within SQL syntax.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[41]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">udf</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.types</span><span class="w"> </span><span class="kn">import</span> <span class="n">StringType</span>
<span class="c1"># Define the UDF</span>
<span class="nd">@udf</span><span class="p">(</span><span class="s2">&quot;string&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">uppercase_name</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="k">return</span> <span class="n">name</span><span class="o">.</span><span class="n">upper</span><span class="p">()</span>
<span class="c1"># Register the UDF</span>
<span class="n">spark</span><span class="o">.</span><span class="n">udf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">&quot;uppercase_name&quot;</span><span class="p">,</span> <span class="n">uppercase_name</span><span class="p">)</span>
<span class="c1"># Use it in SQL</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT name, uppercase_name(name) FROM people_view WHERE age &gt; 21&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+-------+--------------------+
| name|uppercase_name(name)|
+-------+--------------------+
|Charlie| CHARLIE|
+-------+--------------------+
</pre></div></div>
</div>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[ ]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span>
</pre></div>
</div>
</div>
</section>
</section>
</section>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
<a class="left-prev"
href="udfandudtf.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Chapter 5: Unleashing UDFs &amp; UDTFs</p>
</div>
</a>
<a class="right-next"
href="loadandbehold.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Chapter 7: Load and Behold - Data loading, storage, file formats</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div></div>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Introduction">Introduction</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Running-SQL-with-PySpark">Running SQL with PySpark</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Using-spark.sql()">Using <code class="docutils literal notranslate"><span class="pre">spark.sql()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Using-the-PySpark-DataFrame-API">Using the PySpark DataFrame API</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#SQL-vs.-DataFrame-API-in-PySpark">SQL vs. DataFrame API in PySpark</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Code-Examples:-SQL-vs.-DataFrame-API">Code Examples: SQL vs. DataFrame API</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example:-SELECT-and-FILTER-Operation">Example: SELECT and FILTER Operation</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example:-JOIN-Operation">Example: JOIN Operation</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example:-GROUP-BY-and-Aggregate-Operation">Example: GROUP BY and Aggregate Operation</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example:-Window-Operations">Example: Window Operations</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example:-UNION-Operation">Example: UNION Operation</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example:-SET-Configurations">Example: SET Configurations</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example:-Listing-Tables-and-Views">Example: Listing Tables and Views</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#DataFrame-API-Exclusive-Functions">DataFrame API Exclusive Functions</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Using-SQL-and-DataFrame-API-Interchangeably">Using SQL and DataFrame API Interchangeably</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Chaining-DataFrame-Operations-on-SQL-Outputs">Chaining DataFrame Operations on SQL Outputs</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Using-selectExpr()">Using <code class="docutils literal notranslate"><span class="pre">selectExpr()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Querying-a-DataFrame-in-SQL">Querying a DataFrame in SQL</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Use-Python-User-Defined-Functions-in-SQL">Use Python User-Defined Functions in SQL</a></li>
</ul>
</li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div class="tocsection sourcelink">
<a href="../_sources/user_guide/sql.ipynb.txt">
<i class="fa-solid fa-file-lines"></i> Show Source
</a>
</div>
</div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright @ 2025 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>