<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
<title>Debugging PySpark &#8212; PySpark 4.0.0-preview1 documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'development/debugging';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/development/debugging.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Setting up IDEs" href="setting_ide.html" />
<link rel="prev" title="Testing PySpark" href="testing.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/>
<script>document.write(`<img src="../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
function buildURL(entry) {
var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
template = template.replace("{version}", entry.version);
return template;
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
function checkPageExistsAndRedirect(event) {
const currentFilePath = "development/debugging.html",
otherDocsHomepage = event.target.getAttribute("href");
let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
$.ajax({
type: 'HEAD',
url: tryUrl,
// if the page exists, go there
success: function() {
location.href = tryUrl;
}
}).fail(function() {
location.href = otherDocsHomepage;
});
return false;
}
// Function to populate the version switcher
(function () {
// get JSON config
$.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
// create the nodes first (before AJAX calls) to ensure the order is
// correct (for now, links will go to doc version homepage)
$.each(data, function(index, entry) {
// if no custom name specified (e.g., "latest"), use version string
if (!("name" in entry)) {
entry.name = entry.version;
}
// construct the appropriate URL, and add it to the dropdown
entry.url = buildURL(entry);
const node = document.createElement("a");
node.setAttribute("class", "list-group-item list-group-item-action py-1");
node.setAttribute("href", `${entry.url}`);
node.textContent = `${entry.name}`;
node.onclick = checkPageExistsAndRedirect;
$("#version_switcher").append(node);
});
});
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
function buildURL(entry) {
var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
template = template.replace("{version}", entry.version);
return template;
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
function checkPageExistsAndRedirect(event) {
const currentFilePath = "development/debugging.html",
otherDocsHomepage = event.target.getAttribute("href");
let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
$.ajax({
type: 'HEAD',
url: tryUrl,
// if the page exists, go there
success: function() {
location.href = tryUrl;
}
}).fail(function() {
location.href = otherDocsHomepage;
});
return false;
}
// Function to populate the version switcher
(function () {
// get JSON config
$.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
// create the nodes first (before AJAX calls) to ensure the order is
// correct (for now, links will go to doc version homepage)
$.each(data, function(index, entry) {
// if no custom name specified (e.g., "latest"), use version string
if (!("name" in entry)) {
entry.name = entry.version;
}
// construct the appropriate URL, and add it to the dropdown
entry.url = buildURL(entry);
const node = document.createElement("a");
node.setAttribute("class", "list-group-item list-group-item-action py-1");
node.setAttribute("href", `${entry.url}`);
node.textContent = `${entry.name}`;
node.onclick = checkPageExistsAndRedirect;
$("#version_switcher").append(node);
});
});
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="contributing.html">Contributing to PySpark</a></li>
<li class="toctree-l1"><a class="reference internal" href="testing.html">Testing PySpark</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Debugging PySpark</a></li>
<li class="toctree-l1"><a class="reference internal" href="setting_ide.html">Setting up IDEs</a></li>
<li class="toctree-l1"><a class="reference internal" href="errors.html">Error classes in PySpark</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="index.html" class="nav-link">Development</a></li>
<li class="breadcrumb-item active" aria-current="page">Debugging PySpark</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<section id="debugging-pyspark">
<h1>Debugging PySpark<a class="headerlink" href="#debugging-pyspark" title="Permalink to this headline">#</a></h1>
<p>PySpark uses Spark as its engine and relies on <a class="reference external" href="https://www.py4j.org/">Py4J</a> to submit jobs to Spark and run the computations.</p>
<p>On the driver side, PySpark communicates with the JVM driver by using <a class="reference external" href="https://www.py4j.org/">Py4J</a>.
When <a class="reference internal" href="../reference/pyspark.sql/api/pyspark.sql.SparkSession.html#pyspark.sql.SparkSession" title="pyspark.sql.SparkSession"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession</span></code></a> or <a class="reference internal" href="../reference/api/pyspark.SparkContext.html#pyspark.SparkContext" title="pyspark.SparkContext"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.SparkContext</span></code></a> is created and initialized, PySpark launches a JVM
that it communicates with.</p>
<p>On the executor side, Python workers execute and handle Python native functions or data. They are not launched if
a PySpark application does not require interaction between Python workers and JVMs. They are lazily launched only when
Python native functions or data have to be handled, for example, when you execute pandas UDFs or
PySpark RDD APIs.</p>
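<p>As a quick illustration (a minimal sketch assuming a running <code class="docutils literal notranslate"><span class="pre">pyspark</span></code> shell with a <code class="docutils literal notranslate"><span class="pre">spark</span></code> session available), the first statement below runs entirely on the JVM, while the second one forces the executors to lazily launch Python workers because the lambda has to run in Python:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Stays entirely in the JVM; no Python workers are launched.
spark.range(10).count()

# The lambda must run in Python, so the executors lazily launch Python workers.
spark.range(10).rdd.map(lambda row: row.id * 2).collect()
</pre></div>
</div>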
<p>This page focuses on debugging the Python side of PySpark on both the driver and executor sides rather than on debugging the JVM.
Profiling and debugging the JVM are described at <a class="reference external" href="https://spark.apache.org/developer-tools.html">Useful Developer Tools</a>.</p>
<p>Note that,</p>
<ul class="simple">
<li><p>If you are running locally, you can debug the driver side directly in your IDE without the remote debug feature. Setting up PySpark with IDEs is documented <a class="reference internal" href="setting_ide.html#pycharm"><span class="std std-ref">here</span></a>.</p></li>
<li><p><em>There are many other ways of debugging PySpark applications</em>. For example, you can debug remotely by using the open source <a class="reference external" href="https://www.pydev.org/manual_adv_remote_debugger.html">Remote Debugger</a> instead of PyCharm Professional, which is documented here.</p></li>
</ul>
<section id="remote-debugging-pycharm-professional">
<h2>Remote Debugging (PyCharm Professional)<a class="headerlink" href="#remote-debugging-pycharm-professional" title="Permalink to this headline">#</a></h2>
<p>This section describes remote debugging on both the driver and executor sides within a single machine for ease of demonstration.
Debugging PySpark on the executor side differs from debugging on the driver side, so the two are demonstrated separately.
To debug PySpark applications on other machines, please refer to the full, PyCharm-specific instructions documented <a class="reference external" href="https://www.jetbrains.com/help/pycharm/remote-debugging-with-product.html">here</a>.</p>
<p>First, choose <strong>Edit Configuration…</strong> from the <em>Run</em> menu. It opens the <strong>Run/Debug Configurations dialog</strong>.
Click <code class="docutils literal notranslate"><span class="pre">+</span></code> on the toolbar and, from the list of available configurations, select <strong>Python Debug Server</strong>.
Enter a name for this new configuration, for example <code class="docutils literal notranslate"><span class="pre">MyRemoteDebugger</span></code>, and also specify the port number, for example <code class="docutils literal notranslate"><span class="pre">12345</span></code>.</p>
<img alt="PyCharm remote debugger setting" src="../_images/pyspark-remote-debug1.png" />
<div class="line-block">
<div class="line">After that, you should install the corresponding version of the <code class="docutils literal notranslate"><span class="pre">pydevd-pycharm</span></code> package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install.</div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>pip install pydevd-pycharm~=&lt;version of PyCharm on the local machine&gt;
</pre></div>
</div>
<section id="driver-side">
<h3>Driver Side<a class="headerlink" href="#driver-side" title="Permalink to this headline">#</a></h3>
<p>To debug on the driver side, your application should be able to connect to the debugging server. Copy and paste the code
with <code class="docutils literal notranslate"><span class="pre">pydevd_pycharm.settrace</span></code> to the top of your PySpark script. Suppose the script name is <code class="docutils literal notranslate"><span class="pre">app.py</span></code>:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;#======================Copy and paste from the previous dialog===========================</span>
<span class="s2">import pydevd_pycharm</span>
<span class="s2">pydevd_pycharm.settrace(&#39;localhost&#39;, port=12345, stdoutToServer=True, stderrToServer=True)</span>
<span class="s2">#========================================================================================</span>
<span class="s2"># Your PySpark application codes:</span>
<span class="s2">from pyspark.sql import SparkSession</span>
<span class="s2">spark = SparkSession.builder.getOrCreate()</span>
<span class="s2">spark.range(10).show()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>app.py
</pre></div>
</div>
<p>Start debugging with your <code class="docutils literal notranslate"><span class="pre">MyRemoteDebugger</span></code> configuration.</p>
<img alt="PyCharm run remote debugger" src="../_images/pyspark-remote-debug2.png" />
<div class="line-block">
<div class="line">After that, submit your application. This will connect to your PyCharm debugging server and enable you to debug on the driver side remotely.</div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>spark-submit<span class="w"> </span>app.py
</pre></div>
</div>
</section>
<section id="executor-side">
<h3>Executor Side<a class="headerlink" href="#executor-side" title="Permalink to this headline">#</a></h3>
<p>To debug on the executor side, prepare a Python file as below in your current working directory.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;from pyspark import daemon, worker</span>
<span class="s2">def remote_debug_wrapped(*args, **kwargs):</span>
<span class="s2"> #======================Copy and paste from the previous dialog===========================</span>
<span class="s2"> import pydevd_pycharm</span>
<span class="s2"> pydevd_pycharm.settrace(&#39;localhost&#39;, port=12345, stdoutToServer=True, stderrToServer=True)</span>
<span class="s2"> #========================================================================================</span>
<span class="s2"> worker.main(*args, **kwargs)</span>
<span class="s2">daemon.worker_main = remote_debug_wrapped</span>
<span class="s2">if __name__ == &#39;__main__&#39;:</span>
<span class="s2"> daemon.manager()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>remote_debug.py
</pre></div>
</div>
<p>You will use this file as the Python worker in your PySpark applications by using the <code class="docutils literal notranslate"><span class="pre">spark.python.daemon.module</span></code> configuration.
Run the <code class="docutils literal notranslate"><span class="pre">pyspark</span></code> shell with the configuration below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pyspark<span class="w"> </span>--conf<span class="w"> </span>spark.python.daemon.module<span class="o">=</span>remote_debug
</pre></div>
</div>
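<p>If you create the session yourself instead of using the <code class="docutils literal notranslate"><span class="pre">pyspark</span></code> shell, the same configuration can be passed when building the session. The snippet below is a minimal sketch; it assumes <code class="docutils literal notranslate"><span class="pre">remote_debug.py</span></code> is importable by the Python workers (for example, it sits in the current working directory):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from pyspark.sql import SparkSession

# Equivalent to `pyspark --conf spark.python.daemon.module=remote_debug`;
# the configuration has to be in place before the SparkContext is created.
spark = (
    SparkSession.builder
    .config("spark.python.daemon.module", "remote_debug")
    .getOrCreate()
)
</pre></div>
</div>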
<p>Now you’re ready to debug remotely. Start debugging with your <code class="docutils literal notranslate"><span class="pre">MyRemoteDebugger</span></code> configuration.</p>
<img alt="PyCharm run remote debugger" src="../_images/pyspark-remote-debug2.png" />
<div class="line-block">
<div class="line">After that, run a job that creates Python workers, for example, as below:</div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
</pre></div>
</div>
</section>
</section>
<section id="checking-resource-usage-top-and-ps">
<h2>Checking Resource Usage (<code class="docutils literal notranslate"><span class="pre">top</span></code> and <code class="docutils literal notranslate"><span class="pre">ps</span></code>)<a class="headerlink" href="#checking-resource-usage-top-and-ps" title="Permalink to this headline">#</a></h2>
<p>The Python processes on the driver and executors can be checked with standard tools such as the <code class="docutils literal notranslate"><span class="pre">top</span></code> and <code class="docutils literal notranslate"><span class="pre">ps</span></code> commands.</p>
<section id="id2">
<h3>Driver Side<a class="headerlink" href="#id2" title="Permalink to this headline">#</a></h3>
<p>On the driver side, you can easily get the process ID from your PySpark shell as below and then inspect its resource usage.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="p">;</span> <span class="n">os</span><span class="o">.</span><span class="n">getpid</span><span class="p">()</span>
<span class="go">18482</span>
</pre></div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>ps<span class="w"> </span>-fe<span class="w"> </span><span class="m">18482</span>
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>UID PID PPID C STIME TTY TIME CMD
000 18482 12345 0 0:00PM ttys001 0:00.00 /.../python
</pre></div>
</div>
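<p>If you prefer to stay inside Python, a minimal sketch using the standard-library <code class="docutils literal notranslate"><span class="pre">resource</span></code> module gives a rough view of the driver&#8217;s memory usage (Unix only; the unit of <code class="docutils literal notranslate"><span class="pre">ru_maxrss</span></code> is kilobytes on Linux and bytes on macOS):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import os
import resource

print("driver pid:", os.getpid())
# Peak resident set size of the driver process so far
# (kilobytes on Linux, bytes on macOS).
print("peak RSS:", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
</pre></div>
</div>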
</section>
<section id="id3">
<h3>Executor Side<a class="headerlink" href="#id3" title="Permalink to this headline">#</a></h3>
<p>On the executor side, Python workers are forked from <code class="docutils literal notranslate"><span class="pre">pyspark.daemon</span></code>, so you can simply <code class="docutils literal notranslate"><span class="pre">grep</span></code> for it to figure out their process
ids and relevant resources.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>ps<span class="w"> </span>-fe<span class="w"> </span><span class="p">|</span><span class="w"> </span>grep<span class="w"> </span>pyspark.daemon
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
...
</pre></div>
</div>
</section>
</section>
<section id="profiling-memory-usage-memory-profiler">
<h2>Profiling Memory Usage (Memory Profiler)<a class="headerlink" href="#profiling-memory-usage-memory-profiler" title="Permalink to this headline">#</a></h2>
<p><a class="reference external" href="https://github.com/pythonprofilers/memory_profiler">memory_profiler</a> is one of the profilers that allow you to
check the memory usage line by line.</p>
<section id="id4">
<h3>Driver Side<a class="headerlink" href="#id4" title="Permalink to this headline">#</a></h3>
<p>Unless you are running your driver program on another machine (e.g., YARN cluster mode), this useful tool can be used
to debug the memory usage on the driver side easily. Suppose your PySpark script name is <code class="docutils literal notranslate"><span class="pre">profile_memory.py</span></code>.
You can profile it as below.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;from pyspark.sql import SparkSession</span>
<span class="s2">#===Your function should be decorated with @profile===</span>
<span class="s2">from memory_profiler import profile</span>
<span class="s2">@profile</span>
<span class="s2">#=====================================================</span>
<span class="s2">def my_func():</span>
<span class="s2"> session = SparkSession.builder.getOrCreate()</span>
<span class="s2"> df = session.range(10000)</span>
<span class="s2"> return df.collect()</span>
<span class="s2">if __name__ == &#39;__main__&#39;:</span>
<span class="s2"> my_func()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>profile_memory.py
</pre></div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>-m<span class="w"> </span>memory_profiler<span class="w"> </span>profile_memory.py
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>Filename: profile_memory.py
Line # Mem usage Increment Line Contents
================================================
...
6 def my_func():
7 51.5 MiB 0.6 MiB session = SparkSession.builder.getOrCreate()
8 51.5 MiB 0.0 MiB df = session.range(10000)
9 54.4 MiB 2.8 MiB return df.collect()
</pre></div>
</div>
</section>
<section id="python-pandas-udf">
<h3>Python/Pandas UDF<a class="headerlink" href="#python-pandas-udf" title="Permalink to this headline">#</a></h3>
<p>PySpark provides remote <a class="reference external" href="https://github.com/pythonprofilers/memory_profiler">memory_profiler</a>-based profiling for
Python/Pandas UDFs. It can be used in editors with line numbers such as Jupyter notebooks. UDFs with iterators as inputs/outputs are not supported.</p>
<p>The SparkSession-based memory profiler can be enabled by setting the <a class="reference external" href="https://spark.apache.org/docs/latest/configuration.html#runtime-sql-configuration">Runtime SQL configuration</a>
<code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.udf.profiler</span></code> to <code class="docutils literal notranslate"><span class="pre">memory</span></code>. An example on a Jupyter notebook is shown below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
<span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">add1</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="k">return</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">1</span>
<span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;spark.sql.pyspark.udf.profiler&quot;</span><span class="p">,</span> <span class="s2">&quot;memory&quot;</span><span class="p">)</span>
<span class="n">added</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">add1</span><span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span>
<span class="n">added</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="n">spark</span><span class="o">.</span><span class="n">profile</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="nb">type</span><span class="o">=</span><span class="s2">&quot;memory&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>The resulting profile is shown below.</p>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>============================================================
Profile of UDF&lt;id=2&gt;
============================================================
Filename: ...
Line # Mem usage Increment Occurrences Line Contents
=============================================================
4 974.0 MiB 974.0 MiB 10 @pandas_udf(&quot;long&quot;)
5 def add1(x):
6 974.4 MiB 0.4 MiB 10 return x + 1
</pre></div>
</div>
<p>The UDF IDs can be seen in the query plan, for example, <code class="docutils literal notranslate"><span class="pre">add1(...)#2L</span></code> in <code class="docutils literal notranslate"><span class="pre">ArrowEvalPython</span></code> as shown below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">added</span><span class="o">.</span><span class="n">explain</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>== Physical Plan ==
*(2) Project [pythonUDF0#11L AS add1(id)#3L]
+- ArrowEvalPython [add1(id#0L)#2L], [pythonUDF0#11L], 200
+- *(1) Range (0, 10, step=1, splits=16)
</pre></div>
</div>
<p>We can clear the resulting memory profile as shown below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">profile</span><span class="o">.</span><span class="n">clear</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s2">&quot;memory&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="identifying-hot-loops-python-profilers">
<h2>Identifying Hot Loops (Python Profilers)<a class="headerlink" href="#identifying-hot-loops-python-profilers" title="Permalink to this headline">#</a></h2>
<p><a class="reference external" href="https://docs.python.org/3/library/profile.html">Python Profilers</a> are useful built-in features in Python itself. These
provide deterministic profiling of Python programs with a lot of useful statistics. This section describes how to use it on
both driver and executor sides in order to identify expensive or hot code paths.</p>
<section id="id6">
<h3>Driver Side<a class="headerlink" href="#id6" title="Permalink to this headline">#</a></h3>
<p>To use this on the driver side, you can use it as you would for regular Python programs, because PySpark on the driver side is a
regular Python process unless you are running your driver program on another machine (e.g., YARN cluster mode).</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;from pyspark.sql import SparkSession</span>
<span class="s2">spark = SparkSession.builder.getOrCreate()</span>
<span class="s2">spark.range(10).show()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>app.py
</pre></div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>-m<span class="w"> </span>cProfile<span class="w"> </span>app.py
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>...
129215 function calls (125446 primitive calls) in 5.926 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1198/405 0.001 0.000 0.083 0.000 &lt;frozen importlib._bootstrap&gt;:1009(_handle_fromlist)
561 0.001 0.000 0.001 0.000 &lt;frozen importlib._bootstrap&gt;:103(release)
276 0.000 0.000 0.000 0.000 &lt;frozen importlib._bootstrap&gt;:143(__init__)
276 0.000 0.000 0.002 0.000 &lt;frozen importlib._bootstrap&gt;:147(__enter__)
...
</pre></div>
</div>
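<p>To sort the output and keep only the most expensive calls, you can also drive <code class="docutils literal notranslate"><span class="pre">cProfile</span></code> and <code class="docutils literal notranslate"><span class="pre">pstats</span></code> programmatically. This is a minimal sketch; the profiled function is just the example application from above:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import cProfile
import pstats

from pyspark.sql import SparkSession

def main():
    spark = SparkSession.builder.getOrCreate()
    spark.range(10).show()

profiler = cProfile.Profile()
profiler.enable()
main()
profiler.disable()

# Print the ten most expensive calls, sorted by cumulative time.
pstats.Stats(profiler).sort_stats("cumulative").print_stats(10)
</pre></div>
</div>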
</section>
<section id="id7">
<h3>Python/Pandas UDF<a class="headerlink" href="#id7" title="Permalink to this headline">#</a></h3>
<p>PySpark provides remote <a class="reference external" href="https://docs.python.org/3/library/profile.html">Python Profilers</a> for
Python/Pandas UDFs. UDFs with iterators as inputs/outputs are not supported.</p>
<p>The SparkSession-based performance profiler can be enabled by setting the <a class="reference external" href="https://spark.apache.org/docs/latest/configuration.html#runtime-sql-configuration">Runtime SQL configuration</a>
<code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.udf.profiler</span></code> to <code class="docutils literal notranslate"><span class="pre">perf</span></code>. An example is shown below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">add1</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">1</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">added</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">add1</span><span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;spark.sql.pyspark.udf.profiler&quot;</span><span class="p">,</span> <span class="s2">&quot;perf&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">added</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+--------+</span>
<span class="go">|add1(id)|</span>
<span class="go">+--------+</span>
<span class="go">...</span>
<span class="go">+--------+</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">profile</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="nb">type</span><span class="o">=</span><span class="s2">&quot;perf&quot;</span><span class="p">)</span>
<span class="go">============================================================</span>
<span class="go">Profile of UDF&lt;id=2&gt;</span>
<span class="go">============================================================</span>
<span class="go"> 2300 function calls (2270 primitive calls) in 0.006 seconds</span>
<span class="go"> Ordered by: internal time, cumulative time</span>
<span class="go"> ncalls tottime percall cumtime percall filename:lineno(function)</span>
<span class="go"> 10 0.001 0.000 0.005 0.001 series.py:5515(_arith_method)</span>
<span class="go"> 10 0.001 0.000 0.001 0.000 _ufunc_config.py:425(__init__)</span>
<span class="go"> 10 0.000 0.000 0.000 0.000 {built-in method _operator.add}</span>
<span class="go"> 10 0.000 0.000 0.002 0.000 series.py:315(__init__)</span>
<span class="go">...</span>
</pre></div>
</div>
<p>The UDF IDs can be seen in the query plan, for example, <code class="docutils literal notranslate"><span class="pre">add1(...)#2L</span></code> in <code class="docutils literal notranslate"><span class="pre">ArrowEvalPython</span></code> below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">added</span><span class="o">.</span><span class="n">explain</span><span class="p">()</span>
<span class="go">== Physical Plan ==</span>
<span class="go">*(2) Project [pythonUDF0#11L AS add1(id)#3L]</span>
<span class="go">+- ArrowEvalPython [add1(id#0L)#2L], [pythonUDF0#11L], 200</span>
<span class="go"> +- *(1) Range (0, 10, step=1, splits=16)</span>
</pre></div>
</div>
<p>We can clear the resulting performance profile as shown below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">profile</span><span class="o">.</span><span class="n">clear</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s2">&quot;perf&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="common-exceptions-errors">
<h2>Common Exceptions / Errors<a class="headerlink" href="#common-exceptions-errors" title="Permalink to this headline">#</a></h2>
<section id="pyspark-sql">
<h3>PySpark SQL<a class="headerlink" href="#pyspark-sql" title="Permalink to this headline">#</a></h3>
<p><strong>AnalysisException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">AnalysisException</span></code> is raised when failing to analyze a SQL query plan.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;bad_key&#39;</span><span class="p">]</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.errors.exceptions.AnalysisException</span>: <span class="n">Cannot resolve column name &quot;bad_key&quot; among (id)</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;id&#39;</span><span class="p">]</span>
<span class="go">Column&lt;&#39;id&#39;&gt;</span>
</pre></div>
</div>
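<p>Because these are ordinary Python exceptions, you can also catch them explicitly, for example when the query is built dynamically. This is a minimal sketch; the column name is deliberately wrong:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from pyspark.errors import AnalysisException

df = spark.range(1)
try:
    df["bad_key"]
except AnalysisException as e:
    print("Analysis failed:", e)
</pre></div>
</div>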
<p><strong>ParseException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">ParseException</span></code> is raised when failing to parse a SQL command.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;select * 1&quot;</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.errors.exceptions.ParseException</span><span class="w">:</span>
<span class="x">[PARSE_SYNTAX_ERROR] Syntax error at or near &#39;1&#39;: extra input &#39;1&#39;.(line 1, pos 9)</span>
<span class="x">== SQL ==</span>
<span class="x">select * 1</span>
<span class="x">---------^^^</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;select *&quot;</span><span class="p">)</span>
<span class="go">DataFrame[]</span>
</pre></div>
</div>
<p><strong>IllegalArgumentException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">IllegalArgumentException</span></code> is raised when passing an illegal or inappropriate argument.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.errors.exceptions.IllegalArgumentException</span>: <span class="n">requirement failed: Sampling fraction (-1.0) must be on interval [0, 1] without replacement</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)</span>
<span class="go">DataFrame[id: bigint]</span>
</pre></div>
</div>
<p><strong>PythonException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">PythonException</span></code> is thrown from Python workers.</p>
<p>You can see the type of exception that was thrown from the Python worker and its stack trace, such as the <code class="docutils literal notranslate"><span class="pre">TypeError</span></code> below.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">pyspark.sql.functions</span> <span class="k">as</span> <span class="nn">sf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">udf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">sf</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;abs&quot;</span><span class="p">,</span> <span class="n">udf</span><span class="p">(</span><span class="n">f</span><span class="p">)(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">22/04/12 14:52:31 ERROR Executor: Exception in task 7.0 in stage 37.0 (TID 232)</span>
<span class="go">org.apache.spark.api.python.PythonException: Traceback (most recent call last):</span>
<span class="go">...</span>
<span class="go">TypeError: Invalid argument, not a string or column: -1 of type &lt;class &#39;int&#39;&gt;. For column literals, use &#39;lit&#39;, &#39;array&#39;, &#39;struct&#39; or &#39;create_map&#39; function.</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">return</span> <span class="nb">abs</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;abs&quot;</span><span class="p">,</span> <span class="n">udf</span><span class="p">(</span><span class="n">f</span><span class="p">)(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(id=-1, abs=&#39;1&#39;), Row(id=0, abs=&#39;0&#39;)]</span>
</pre></div>
</div>
<p><strong>StreamingQueryException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">StreamingQueryException</span></code> is raised when failing a StreamingQuery. Most often, it is thrown from Python workers, that wrap it as a <code class="docutils literal notranslate"><span class="pre">PythonException</span></code>.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">sdf</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">readStream</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;text&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s2">&quot;python/test_support/sql/streaming&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">col</span><span class="p">,</span> <span class="n">udf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bad_udf</span> <span class="o">=</span> <span class="n">udf</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="mi">1</span> <span class="o">/</span> <span class="mi">0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">bad_udf</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;value&quot;</span><span class="p">)))</span><span class="o">.</span><span class="n">writeStream</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;memory&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">queryName</span><span class="p">(</span><span class="s2">&quot;q1&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">start</span><span class="p">())</span><span class="o">.</span><span class="n">processAllAvailable</span><span class="p">()</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">org.apache.spark.api.python.PythonException</span>: <span class="n">Traceback (most recent call last):</span>
File <span class="nb">&quot;&lt;stdin&gt;&quot;</span>, line <span class="m">1</span>, in <span class="n">&lt;lambda&gt;</span>
<span class="gr">ZeroDivisionError</span>: <span class="n">division by zero</span>
<span class="x">...</span>
<span class="x">pyspark.errors.exceptions.StreamingQueryException: [STREAM_FAILED] Query [id = 74eb53a8-89bd-49b0-9313-14d29eed03aa, runId = 9f2d5cf6-a373-478d-b718-2c2b6d8a0f24] terminated with exception: Job aborted</span>
</pre></div>
</div>
<p>Solution:</p>
<p>Fix the StreamingQuery and re-execute the workflow.</p>
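<p>For example (a hedged sketch continuing the example above), replacing the failing UDF with one that cannot raise lets the query run, and wrapping <code class="docutils literal notranslate"><span class="pre">processAllAvailable()</span></code> in a try/except lets you react to any remaining failures:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from pyspark.errors import StreamingQueryException
from pyspark.sql.functions import col, udf

# A UDF that cannot divide by zero, unlike bad_udf above.
ok_udf = udf(lambda x: len(x) if x is not None else 0)
query = (
    sdf.select(ok_udf(col("value")))
    .writeStream.format("memory")
    .queryName("q2")
    .start()
)
try:
    query.processAllAvailable()
except StreamingQueryException as e:
    print("Query failed:", e)
finally:
    query.stop()
</pre></div>
</div>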
<p><strong>SparkUpgradeException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">SparkUpgradeException</span></code> is thrown because of Spark upgrade.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">to_date</span><span class="p">,</span> <span class="n">unix_timestamp</span><span class="p">,</span> <span class="n">from_unixtime</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="s2">&quot;2014-31-12&quot;</span><span class="p">,)],</span> <span class="p">[</span><span class="s2">&quot;date_str&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="n">to_date</span><span class="p">(</span><span class="n">from_unixtime</span><span class="p">(</span><span class="n">unix_timestamp</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="s2">&quot;yyyy-dd-aa&quot;</span><span class="p">))))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.sql.utils.SparkUpgradeException</span>: <span class="n">You may get a different result due to the upgrading to Spark &gt;= 3.0: Fail to recognize &#39;yyyy-dd-aa&#39; pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;spark.sql.legacy.timeParserPolicy&quot;</span><span class="p">,</span> <span class="s2">&quot;LEGACY&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="n">to_date</span><span class="p">(</span><span class="n">from_unixtime</span><span class="p">(</span><span class="n">unix_timestamp</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="s2">&quot;yyyy-dd-aa&quot;</span><span class="p">))))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(date_str=&#39;2014-31-12&#39;, to_date(from_unixtime(unix_timestamp(date_str, yyyy-dd-aa), yyyy-MM-dd HH:mm:ss))=None)]</span>
</pre></div>
</div>
</section>
<section id="pandas-api-on-spark">
<h3>pandas API on Spark<a class="headerlink" href="#pandas-api-on-spark" title="Permalink to this headline">#</a></h3>
<p>The following exceptions / errors are commonly seen when using the pandas API on Spark.</p>
<p><strong>ValueError: Cannot combine the series or dataframe because it comes from a different dataframe</strong></p>
<p>Operations involving more than one Series or DataFrame raise a <code class="docutils literal notranslate"><span class="pre">ValueError</span></code> if <code class="docutils literal notranslate"><span class="pre">compute.ops_on_diff_frames</span></code> is disabled (it is disabled by default). Such operations can be expensive because they require joining the underlying Spark DataFrames, so users should be aware of the cost and enable the flag only when necessary.</p>
<p>Exception:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">ValueError</span>: <span class="n">Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable &#39;compute.ops_on_diff_frames&#39; option.</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">with</span> <span class="n">ps</span><span class="o">.</span><span class="n">option_context</span><span class="p">(</span><span class="s1">&#39;compute.ops_on_diff_frames&#39;</span><span class="p">,</span> <span class="kc">True</span><span class="p">):</span>
<span class="gp">... </span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">])</span>
<span class="gp">...</span>
<span class="go">0 4</span>
<span class="go">1 6</span>
<span class="go">dtype: int64</span>
</pre></div>
</div>
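<p>If cross-frame operations are needed in many places, the option can also be enabled globally with the pandas-on-Spark option API instead of a <code class="docutils literal notranslate"><span class="pre">with</span></code> block; a short sketch:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>&gt;&gt;&gt; import pyspark.pandas as ps
&gt;&gt;&gt; ps.set_option(&quot;compute.ops_on_diff_frames&quot;, True)  # allow operations across different frames globally
&gt;&gt;&gt; ps.Series([1, 2]) + ps.Series([3, 4])
0    4
1    6
dtype: int64
&gt;&gt;&gt; ps.reset_option(&quot;compute.ops_on_diff_frames&quot;)  # restore the default to avoid hidden join costs
</pre></div>
</div>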
<p><strong>RuntimeError: Result vector from pandas_udf was not the required length</strong></p>
<p>Exception:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">]:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">x</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;x&quot;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s2">&quot;y&quot;</span><span class="p">:[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]})</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="go">22/04/12 13:46:39 ERROR Executor: Exception in task 2.0 in stage 16.0 (TID 88)</span>
<span class="go">org.apache.spark.api.python.PythonException: Traceback (most recent call last):</span>
<span class="go">...</span>
<span class="go">RuntimeError: Result vector from pandas_udf was not the required length: expected 1, got 0</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">]:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">x</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;x&quot;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s2">&quot;y&quot;</span><span class="p">:[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]})</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="go"> x y</span>
<span class="go">0 1 3</span>
<span class="go">1 2 4</span>
</pre></div>
</div>
</section>
<section id="id10">
<h3>Py4j<a class="headerlink" href="#id10" title="Permalink to this headline">#</a></h3>
<p><strong>Py4JJavaError</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">Py4JJavaError</span></code> is raised when an exception occurs in the Java code invoked through Py4J.
You can see the type of exception that was thrown on the Java side and its stack trace, such as the <code class="docutils literal notranslate"><span class="pre">java.lang.NullPointerException</span></code> below.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">py4j.protocol.Py4JJavaError</span>: <span class="n">An error occurred while calling None.java.lang.String.</span>
<span class="x">: java.lang.NullPointerException</span>
<span class="x">..</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">)</span>
<span class="go">&#39;x&#39;</span>
</pre></div>
</div>
<p><strong>Py4JError</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">Py4JError</span></code> is raised when any other error occurs, such as when the Python client program tries to access an object that no longer exists on the Java side.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.regression</span> <span class="kn">import</span> <span class="n">LinearRegression</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;weight&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">],</span>
<span class="gp">... </span> <span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span> <span class="o">=</span> <span class="n">LinearRegression</span><span class="p">(</span>
<span class="gp">... </span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">regParam</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">solver</span><span class="o">=</span><span class="s2">&quot;normal&quot;</span><span class="p">,</span> <span class="n">weightCol</span><span class="o">=</span><span class="s2">&quot;weight&quot;</span><span class="p">,</span> <span class="n">fitIntercept</span><span class="o">=</span><span class="kc">False</span>
<span class="gp">... </span> <span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span>
<span class="go">LinearRegressionModel: uid=LinearRegression_eb7bc1d4bf25, numFeatures=1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="fm">__del__</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">py4j.protocol.Py4JError</span>: <span class="n">An error occurred while calling o531.toString. Trace:</span>
<span class="x">py4j.Py4JException: Target Object ID does not exist for this gateway :o531</span>
<span class="x">...</span>
</pre></div>
</div>
<p>Solution:</p>
<p>Access only objects that still exist on the Java side; do not reuse references whose backing Java objects have been released.</p>
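<p>A minimal sketch for the example above: recreate the JVM-side object, here by fitting the estimator again, rather than reusing the reference whose Java counterpart was released by <code class="docutils literal notranslate"><span class="pre">__del__()</span></code>.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>&gt;&gt;&gt; model = lr.fit(df)  # fit again so a live Java object backs the Python reference
&gt;&gt;&gt; model               # the reference can be used normally again
LinearRegressionModel: uid=..., numFeatures=1
</pre></div>
</div>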
<p><strong>Py4JNetworkError</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">Py4JNetworkError</span></code> is raised when a problem occurs during network transfer (e.g., the connection is lost). In this case, check the network between the Python process and the JVM and re-establish the connection.</p>
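<p>A minimal sketch of rebuilding the connection, assuming the driver Python process is still healthy and only the session needs to be re-established; note that any state held in the previous JVM is lost.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>&gt;&gt;&gt; from pyspark.sql import SparkSession
&gt;&gt;&gt; spark.stop()  # release the broken session if it can still be reached
&gt;&gt;&gt; spark = SparkSession.builder.getOrCreate()  # create a new session with a fresh connection to the JVM
</pre></div>
</div>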
</section>
</section>
<section id="stack-traces">
<h2>Stack Traces<a class="headerlink" href="#stack-traces" title="Permalink to this headline">#</a></h2>
<p>There are Spark configurations to control stack traces:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled</span></code> is true by default, which simplifies the traceback from Python UDFs.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.jvmStacktrace.enabled</span></code> is false by default, which hides the JVM stack trace and shows only a Python-friendly exception.</p></li>
</ul>
<p>The Spark configurations above are independent of the log level settings. Control log levels through <a class="reference internal" href="../reference/api/pyspark.SparkContext.setLogLevel.html#pyspark.SparkContext.setLogLevel" title="pyspark.SparkContext.setLogLevel"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyspark.SparkContext.setLogLevel()</span></code></a>.</p>
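<p>Both configurations can be changed at runtime; a short sketch of toggling them together with the log level (the values shown are illustrative):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>&gt;&gt;&gt; # keep the full Python traceback instead of the simplified one
&gt;&gt;&gt; spark.conf.set(&quot;spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled&quot;, False)
&gt;&gt;&gt; # also surface the JVM stack trace alongside the Python-friendly exception
&gt;&gt;&gt; spark.conf.set(&quot;spark.sql.pyspark.jvmStacktrace.enabled&quot;, True)
&gt;&gt;&gt; # log verbosity is controlled separately
&gt;&gt;&gt; spark.sparkContext.setLogLevel(&quot;INFO&quot;)
</pre></div>
</div>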
</section>
</section>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
<a class="left-prev"
href="testing.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Testing PySpark</p>
</div>
</a>
<a class="right-next"
href="setting_ide.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Setting up IDEs</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div></div>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#remote-debugging-pycharm-professional">Remote Debugging (PyCharm Professional)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#driver-side">Driver Side</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#executor-side">Executor Side</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#checking-resource-usage-top-and-ps">Checking Resource Usage (<code class="docutils literal notranslate"><span class="pre">top</span></code> and <code class="docutils literal notranslate"><span class="pre">ps</span></code>)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id2">Driver Side</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id3">Executor Side</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#profiling-memory-usage-memory-profiler">Profiling Memory Usage (Memory Profiler)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id4">Driver Side</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#python-pandas-udf">Python/Pandas UDF</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#identifying-hot-loops-python-profilers">Identifying Hot Loops (Python Profilers)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id6">Driver Side</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id7">Python/Pandas UDF</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#common-exceptions-errors">Common Exceptions / Errors</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#pyspark-sql">PySpark SQL</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#pandas-api-on-spark">pandas API on Spark</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id10">Py4j</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#stack-traces">Stack Traces</a></li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div class="tocsection sourcelink">
<a href="../_sources/development/debugging.rst.txt">
<i class="fa-solid fa-file-lines"></i> Show Source
</a>
</div>
</div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>