<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>Debugging PySpark &#8212; PySpark 3.5.4 documentation</title>
<link href="../_static/styles/theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link href="../_static/styles/pydata-sphinx-theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link rel="stylesheet"
href="../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet" href="../_static/styles/pydata-sphinx-theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf">
<script id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/language_data.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/development/debugging.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Setting up IDEs" href="setting_ide.html" />
<link rel="prev" title="Testing PySpark" href="testing.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Google Analytics -->
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<div class="container-fluid" id="banner"></div>
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"><div class="container-xl">
<div id="navbar-start">
<a class="navbar-brand" href="../index.html">
<img src="../_static/spark-logo-reverse.png" class="logo" alt="logo">
</a>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-collapsible" aria-controls="navbar-collapsible" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-collapsible" class="col-lg-9 collapse navbar-collapse">
<div id="navbar-center" class="mr-auto">
<div class="navbar-center-item">
<ul id="navbar-main-elements" class="navbar-nav">
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../index.html">
Overview
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../getting_started/index.html">
Getting Started
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../user_guide/index.html">
User Guides
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../reference/index.html">
API Reference
</a>
</li>
<li class="toctree-l1 current active nav-item">
<a class="reference internal nav-link" href="index.html">
Development
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</div>
</div>
<div id="navbar-end">
<div class="navbar-end-item">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
3.5.4
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
function buildURL(entry) {
var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
template = template.replace("{version}", entry.version);
return template;
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
function checkPageExistsAndRedirect(event) {
const currentFilePath = "development/debugging.html",
otherDocsHomepage = event.target.getAttribute("href");
let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
$.ajax({
type: 'HEAD',
url: tryUrl,
// if the page exists, go there
success: function() {
location.href = tryUrl;
}
}).fail(function() {
location.href = otherDocsHomepage;
});
return false;
}
// Function to populate the version switcher
(function () {
// get JSON config
$.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
// create the nodes first (before AJAX calls) to ensure the order is
// correct (for now, links will go to doc version homepage)
$.each(data, function(index, entry) {
// if no custom name specified (e.g., "latest"), use version string
if (!("name" in entry)) {
entry.name = entry.version;
}
// construct the appropriate URL, and add it to the dropdown
entry.url = buildURL(entry);
const node = document.createElement("a");
node.setAttribute("class", "list-group-item list-group-item-action py-1");
node.setAttribute("href", `${entry.url}`);
node.textContent = `${entry.name}`;
node.onclick = checkPageExistsAndRedirect;
$("#version_switcher").append(node);
});
});
})();
</script>
</div>
</div>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<!-- Only show if we have sidebars configured, else just a small margin -->
<div class="col-12 col-md-3 bd-sidebar">
<div class="sidebar-start-items"><form class="bd-search d-flex align-items-center" action="../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<ul class="current nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="contributing.html">
Contributing to PySpark
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="testing.html">
Testing PySpark
</a>
</li>
<li class="toctree-l1 current active">
<a class="current reference internal" href="#">
Debugging PySpark
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="setting_ide.html">
Setting up IDEs
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="errors.html">
Error classes in PySpark
</a>
</li>
</ul>
</div>
</nav>
</div>
<div class="sidebar-end-items">
</div>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<div class="toc-item">
<div class="tocsection onthispage pt-5 pb-3">
<i class="fas fa-list"></i> On this page
</div>
<nav id="bd-toc-nav">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#remote-debugging-pycharm-professional">
Remote Debugging (PyCharm Professional)
</a>
<ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#driver-side">
Driver Side
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#executor-side">
Executor Side
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#checking-resource-usage-top-and-ps">
Checking Resource Usage (
<code class="docutils literal notranslate">
<span class="pre">
top
</span>
</code>
and
<code class="docutils literal notranslate">
<span class="pre">
ps
</span>
</code>
)
</a>
<ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#id2">
Driver Side
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#id3">
Executor Side
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#profiling-memory-usage-memory-profiler">
Profiling Memory Usage (Memory Profiler)
</a>
<ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#id4">
Driver Side
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#python-pandas-udf">
Python/Pandas UDF
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#identifying-hot-loops-python-profilers">
Identifying Hot Loops (Python Profilers)
</a>
<ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#id6">
Driver Side
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#id7">
Executor Side
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#id9">
Python/Pandas UDF
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#common-exceptions-errors">
Common Exceptions / Errors
</a>
<ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#pyspark-sql">
PySpark SQL
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#pandas-api-on-spark">
pandas API on Spark
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#id11">
Py4j
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#stack-traces">
Stack Traces
</a>
</li>
</ul>
</nav>
</div>
<div class="toc-item">
</div>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<div class="section" id="debugging-pyspark">
<h1>Debugging PySpark<a class="headerlink" href="#debugging-pyspark" title="Permalink to this headline">¶</a></h1>
<p>PySpark uses Spark as an engine. PySpark uses <a class="reference external" href="https://www.py4j.org/">Py4J</a> to leverage Spark to submit and compute jobs.</p>
<p>On the driver side, PySpark communicates with the JVM driver by using <a class="reference external" href="https://www.py4j.org/">Py4J</a>.
When <a class="reference internal" href="../reference/pyspark.sql/api/pyspark.sql.SparkSession.html#pyspark.sql.SparkSession" title="pyspark.sql.SparkSession"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession</span></code></a> or <a class="reference internal" href="../reference/api/pyspark.SparkContext.html#pyspark.SparkContext" title="pyspark.SparkContext"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.SparkContext</span></code></a> is created and initialized, PySpark launches a JVM
and communicates with it.</p>
<p>On the executor side, Python workers execute and handle Python native functions or data. They are not launched if
a PySpark application does not require interaction between Python workers and JVMs. They are lazily launched only when
Python native functions or data have to be handled, for example, when you execute pandas UDFs or
PySpark RDD APIs.</p>
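<p>For example, the first job below runs entirely on the JVM and launches no Python workers, while the second launches them lazily because it evaluates a Python lambda over an RDD:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Stays entirely on the JVM; no Python workers are launched.
spark.range(10).count()

# Applying a Python lambda over the RDD requires Python workers,
# so they are launched lazily on the executors at this point.
spark.range(10).rdd.map(lambda x: x).count()
</pre></div>
</div>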
<p>This page focuses on debugging the Python side of PySpark on both the driver and executor sides, rather than on debugging
the JVM. Profiling and debugging the JVM are described at <a class="reference external" href="https://spark.apache.org/developer-tools.html">Useful Developer Tools</a>.</p>
<p>Note that,</p>
<ul class="simple">
<li><p>If you are running locally, you can debug the driver side directly in your IDE without the remote debug feature. Setting up PySpark with IDEs is documented <a class="reference internal" href="setting_ide.html#pycharm"><span class="std std-ref">here</span></a>.</p></li>
<li><p><em>There are many other ways of debugging PySpark applications</em>. For example, you can debug remotely by using the open source <a class="reference external" href="https://www.pydev.org/manual_adv_remote_debugger.html">Remote Debugger</a> instead of the PyCharm Professional approach documented on this page.</p></li>
</ul>
<div class="section" id="remote-debugging-pycharm-professional">
<h2>Remote Debugging (PyCharm Professional)<a class="headerlink" href="#remote-debugging-pycharm-professional" title="Permalink to this headline">¶</a></h2>
<p>This section describes remote debugging on both the driver and executor sides within a single machine for ease of demonstration.
Debugging PySpark on the executor side differs from debugging on the driver side, so the two are demonstrated separately.
To debug PySpark applications on other machines, please refer to the full PyCharm-specific instructions
documented <a class="reference external" href="https://www.jetbrains.com/help/pycharm/remote-debugging-with-product.html">here</a>.</p>
<p>First, choose <strong>Edit Configuration…</strong> from the <em>Run</em> menu. It opens the <strong>Run/Debug Configurations dialog</strong>.
Click <code class="docutils literal notranslate"><span class="pre">+</span></code> on the toolbar and, from the list of available configurations, select <strong>Python Debug Server</strong>.
Enter a name for this new configuration, for example <code class="docutils literal notranslate"><span class="pre">MyRemoteDebugger</span></code>, and also specify a port number, for example <code class="docutils literal notranslate"><span class="pre">12345</span></code>.</p>
<img alt="PyCharm remote debugger setting" src="../_images/pyspark-remote-debug1.png" />
<div class="line-block">
<div class="line">After that, you should install the corresponding version of the <code class="docutils literal notranslate"><span class="pre">pydevd-pycharm</span></code> package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install.</div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>pip install pydevd-pycharm~=&lt;version of PyCharm on the local machine&gt;
</pre></div>
</div>
<div class="section" id="driver-side">
<h3>Driver Side<a class="headerlink" href="#driver-side" title="Permalink to this headline">ΒΆ</a></h3>
<p>To debug on the driver side, your application should be able to connect to the debugging server. Copy and paste the code
with <code class="docutils literal notranslate"><span class="pre">pydevd_pycharm.settrace</span></code> at the top of your PySpark script. Suppose the script name is <code class="docutils literal notranslate"><span class="pre">app.py</span></code>:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;#======================Copy and paste from the previous dialog===========================</span>
<span class="s2">import pydevd_pycharm</span>
<span class="s2">pydevd_pycharm.settrace(&#39;localhost&#39;, port=12345, stdoutToServer=True, stderrToServer=True)</span>
<span class="s2">#========================================================================================</span>
<span class="s2"># Your PySpark application codes:</span>
<span class="s2">from pyspark.sql import SparkSession</span>
<span class="s2">spark = SparkSession.builder.getOrCreate()</span>
<span class="s2">spark.range(10).show()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>app.py
</pre></div>
</div>
<p>Start debugging with your <code class="docutils literal notranslate"><span class="pre">MyRemoteDebugger</span></code> configuration.</p>
<img alt="PyCharm run remote debugger" src="../_images/pyspark-remote-debug2.png" />
<div class="line-block">
<div class="line">After that, submit your application. This will connect to your PyCharm debugging server and enable you to debug on the driver side remotely.</div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>spark-submit<span class="w"> </span>app.py
</pre></div>
</div>
</div>
<div class="section" id="executor-side">
<h3>Executor Side<a class="headerlink" href="#executor-side" title="Permalink to this headline">ΒΆ</a></h3>
<p>To debug on the executor side, prepare a Python file as below in your current working directory.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;from pyspark import daemon, worker</span>
<span class="s2">def remote_debug_wrapped(*args, **kwargs):</span>
<span class="s2"> #======================Copy and paste from the previous dialog===========================</span>
<span class="s2"> import pydevd_pycharm</span>
<span class="s2"> pydevd_pycharm.settrace(&#39;localhost&#39;, port=12345, stdoutToServer=True, stderrToServer=True)</span>
<span class="s2"> #========================================================================================</span>
<span class="s2"> worker.main(*args, **kwargs)</span>
<span class="s2">daemon.worker_main = remote_debug_wrapped</span>
<span class="s2">if __name__ == &#39;__main__&#39;:</span>
<span class="s2"> daemon.manager()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>remote_debug.py
</pre></div>
</div>
<p>You will use this file as the Python worker in your PySpark applications by using the <code class="docutils literal notranslate"><span class="pre">spark.python.daemon.module</span></code> configuration.
Run the <code class="docutils literal notranslate"><span class="pre">pyspark</span></code> shell with the configuration below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pyspark<span class="w"> </span>--conf<span class="w"> </span>spark.python.daemon.module<span class="o">=</span>remote_debug
</pre></div>
</div>
<p>Now you're ready to debug remotely. Start debugging with your <code class="docutils literal notranslate"><span class="pre">MyRemoteDebugger</span></code> configuration.</p>
<img alt="PyCharm run remote debugger" src="../_images/pyspark-remote-debug2.png" />
<div class="line-block">
<div class="line">After that, run a job that creates Python workers, for example, as below:</div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
</pre></div>
</div>
</div>
</div>
<div class="section" id="checking-resource-usage-top-and-ps">
<h2>Checking Resource Usage (<code class="docutils literal notranslate"><span class="pre">top</span></code> and <code class="docutils literal notranslate"><span class="pre">ps</span></code>)<a class="headerlink" href="#checking-resource-usage-top-and-ps" title="Permalink to this headline">¶</a></h2>
<p>The Python processes on the driver and executors can be checked with typical OS tools such as the <code class="docutils literal notranslate"><span class="pre">top</span></code> and <code class="docutils literal notranslate"><span class="pre">ps</span></code> commands.</p>
<div class="section" id="id2">
<h3>Driver Side<a class="headerlink" href="#id2" title="Permalink to this headline">ΒΆ</a></h3>
<p>On the driver side, you can easily get the process ID from your PySpark shell, as below, and then inspect its resource usage.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="p">;</span> <span class="n">os</span><span class="o">.</span><span class="n">getpid</span><span class="p">()</span>
<span class="go">18482</span>
</pre></div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>ps<span class="w"> </span>-fe<span class="w"> </span><span class="m">18482</span>
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>UID PID PPID C STIME TTY TIME CMD
000 18482 12345 0 0:00PM ttys001 0:00.00 /.../python
</pre></div>
</div>
</div>
<div class="section" id="id3">
<h3>Executor Side<a class="headerlink" href="#id3" title="Permalink to this headline">ΒΆ</a></h3>
<p>On the executor side, Python workers are forked from <code class="docutils literal notranslate"><span class="pre">pyspark.daemon</span></code>, so you can simply <code class="docutils literal notranslate"><span class="pre">grep</span></code> for them to find the process
IDs and the relevant resources.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>ps<span class="w"> </span>-fe<span class="w"> </span><span class="p">|</span><span class="w"> </span>grep<span class="w"> </span>pyspark.daemon
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon
...
</pre></div>
</div>
</div>
</div>
<div class="section" id="profiling-memory-usage-memory-profiler">
<h2>Profiling Memory Usage (Memory Profiler)<a class="headerlink" href="#profiling-memory-usage-memory-profiler" title="Permalink to this headline">¶</a></h2>
<p><a class="reference external" href="https://github.com/pythonprofilers/memory_profiler">memory_profiler</a> is one of the profilers that allow you to
check the memory usage line by line.</p>
<div class="section" id="id4">
<h3>Driver Side<a class="headerlink" href="#id4" title="Permalink to this headline">ΒΆ</a></h3>
<p>Unless you are running your driver program on another machine (e.g., YARN cluster mode), this useful tool can be used
to debug memory usage on the driver side easily. Suppose your PySpark script name is <code class="docutils literal notranslate"><span class="pre">profile_memory.py</span></code>.
You can profile it as below.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;from pyspark.sql import SparkSession</span>
<span class="s2">#===Your function should be decorated with @profile===</span>
<span class="s2">from memory_profiler import profile</span>
<span class="s2">@profile</span>
<span class="s2">#=====================================================</span>
<span class="s2">def my_func():</span>
<span class="s2"> session = SparkSession.builder.getOrCreate()</span>
<span class="s2"> df = session.range(10000)</span>
<span class="s2"> return df.collect()</span>
<span class="s2">if __name__ == &#39;__main__&#39;:</span>
<span class="s2"> my_func()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>profile_memory.py
</pre></div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>-m<span class="w"> </span>memory_profiler<span class="w"> </span>profile_memory.py
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>Filename: profile_memory.py
Line # Mem usage Increment Line Contents
================================================
...
6 def my_func():
7 51.5 MiB 0.6 MiB session = SparkSession.builder.getOrCreate()
8 51.5 MiB 0.0 MiB df = session.range(10000)
9 54.4 MiB 2.8 MiB return df.collect()
</pre></div>
</div>
</div>
<div class="section" id="python-pandas-udf">
<h3>Python/Pandas UDF<a class="headerlink" href="#python-pandas-udf" title="Permalink to this headline">ΒΆ</a></h3>
<p>PySpark provides a remote <a class="reference external" href="https://github.com/pythonprofilers/memory_profiler">memory_profiler</a> for
Python/Pandas UDFs, which can be enabled by setting the <code class="docutils literal notranslate"><span class="pre">spark.python.profile.memory</span></code> configuration to <code class="docutils literal notranslate"><span class="pre">true</span></code>. It
works best in editors that show line numbers, such as Jupyter notebooks. An example in a Jupyter notebook is shown below.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pyspark<span class="w"> </span>--conf<span class="w"> </span>spark.python.profile.memory<span class="o">=</span><span class="nb">true</span>
</pre></div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
<span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">add1</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="k">return</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">1</span>
<span class="n">added</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">add1</span><span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span>
<span class="n">added</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="n">sc</span><span class="o">.</span><span class="n">show_profiles</span><span class="p">()</span>
</pre></div>
</div>
<p>The resulting profile is shown below.</p>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>============================================================
Profile of UDF&lt;id=2&gt;
============================================================
Filename: ...
Line # Mem usage Increment Occurrences Line Contents
=============================================================
4 974.0 MiB 974.0 MiB 10 @pandas_udf(&quot;long&quot;)
5 def add1(x):
6 974.4 MiB 0.4 MiB 10 return x + 1
</pre></div>
</div>
<p>The UDF IDs can be seen in the query plan, for example, <code class="docutils literal notranslate"><span class="pre">add1(...)#2L</span></code> in <code class="docutils literal notranslate"><span class="pre">ArrowEvalPython</span></code> as shown below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">added</span><span class="o">.</span><span class="n">explain</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>== Physical Plan ==
*(2) Project [pythonUDF0#11L AS add1(id)#3L]
+- ArrowEvalPython [add1(id#0L)#2L], [pythonUDF0#11L], 200
+- *(1) Range (0, 10, step=1, splits=16)
</pre></div>
</div>
<p>This feature is not supported with registered UDFs or UDFs with iterators as inputs/outputs.</p>
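<p>For instance, registering the same <code class="docutils literal notranslate"><span class="pre">add1</span></code> function for SQL use (a brief sketch; the name <code class="docutils literal notranslate"><span class="pre">add1_sql</span></code> is arbitrary) is one case this feature does not cover:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Registered UDFs are not supported by the UDF memory profiler,
# so this query does not produce an additional memory profile.
spark.udf.register("add1_sql", add1)
spark.sql("SELECT add1_sql(id) FROM range(10)").show()
</pre></div>
</div>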
</div>
</div>
<div class="section" id="identifying-hot-loops-python-profilers">
<h2>Identifying Hot Loops (Python Profilers)<a class="headerlink" href="#identifying-hot-loops-python-profilers" title="Permalink to this headline">¶</a></h2>
<p><a class="reference external" href="https://docs.python.org/3/library/profile.html">Python Profilers</a> are useful built-in features in Python itself. These
provide deterministic profiling of Python programs with a lot of useful statistics. This section describes how to use it on
both driver and executor sides in order to identify expensive or hot code paths.</p>
<div class="section" id="id6">
<h3>Driver Side<a class="headerlink" href="#id6" title="Permalink to this headline">ΒΆ</a></h3>
<p>On the driver side, you can use the profilers as you would for any regular Python program, because the PySpark driver is a
regular Python process, unless you are running your driver program on another machine (e.g., YARN cluster mode).</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;from pyspark.sql import SparkSession</span>
<span class="s2">spark = SparkSession.builder.getOrCreate()</span>
<span class="s2">spark.range(10).show()&quot;</span><span class="w"> </span>&gt;<span class="w"> </span>app.py
</pre></div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>-m<span class="w"> </span>cProfile<span class="w"> </span>app.py
</pre></div>
</div>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>...
129215 function calls (125446 primitive calls) in 5.926 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1198/405 0.001 0.000 0.083 0.000 &lt;frozen importlib._bootstrap&gt;:1009(_handle_fromlist)
561 0.001 0.000 0.001 0.000 &lt;frozen importlib._bootstrap&gt;:103(release)
276 0.000 0.000 0.000 0.000 &lt;frozen importlib._bootstrap&gt;:143(__init__)
276 0.000 0.000 0.002 0.000 &lt;frozen importlib._bootstrap&gt;:147(__enter__)
...
</pre></div>
</div>
</div>
<div class="section" id="id7">
<h3>Executor Side<a class="headerlink" href="#id7" title="Permalink to this headline">ΒΆ</a></h3>
<p>On the executor side, PySpark provides remote <a class="reference external" href="https://docs.python.org/3/library/profile.html">Python Profilers</a>,
which can be enabled by setting the <code class="docutils literal notranslate"><span class="pre">spark.python.profile</span></code> configuration to <code class="docutils literal notranslate"><span class="pre">true</span></code>.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pyspark<span class="w"> </span>--conf<span class="w"> </span>spark.python.profile<span class="o">=</span><span class="nb">true</span>
</pre></div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">))</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="nb">str</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">100</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sc</span><span class="o">.</span><span class="n">show_profiles</span><span class="p">()</span>
<span class="go">============================================================</span>
<span class="go">Profile of RDD&lt;id=1&gt;</span>
<span class="go">============================================================</span>
<span class="go"> 728 function calls (692 primitive calls) in 0.004 seconds</span>
<span class="go"> Ordered by: internal time, cumulative time</span>
<span class="go"> ncalls tottime percall cumtime percall filename:lineno(function)</span>
<span class="go"> 12 0.001 0.000 0.001 0.000 serializers.py:210(load_stream)</span>
<span class="go"> 12 0.000 0.000 0.000 0.000 {built-in method _pickle.dumps}</span>
<span class="go"> 12 0.000 0.000 0.001 0.000 serializers.py:252(dump_stream)</span>
<span class="go"> 12 0.000 0.000 0.001 0.000 context.py:506(f)</span>
<span class="go">...</span>
</pre></div>
</div>
</div>
<div class="section" id="id9">
<h3>Python/Pandas UDF<a class="headerlink" href="#id9" title="Permalink to this headline">ΒΆ</a></h3>
<p>For Python/Pandas UDFs, PySpark provides remote <a class="reference external" href="https://docs.python.org/3/library/profile.html">Python Profilers</a>,
which can be enabled by setting the <code class="docutils literal notranslate"><span class="pre">spark.python.profile</span></code> configuration to <code class="docutils literal notranslate"><span class="pre">true</span></code>.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pyspark<span class="w"> </span>--conf<span class="w"> </span>spark.python.profile<span class="o">=</span><span class="nb">true</span>
</pre></div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">add1</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">1</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">added</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">add1</span><span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">added</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+--------+</span>
<span class="go">|add1(id)|</span>
<span class="go">+--------+</span>
<span class="go">...</span>
<span class="go">+--------+</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sc</span><span class="o">.</span><span class="n">show_profiles</span><span class="p">()</span>
<span class="go">============================================================</span>
<span class="go">Profile of UDF&lt;id=2&gt;</span>
<span class="go">============================================================</span>
<span class="go"> 2300 function calls (2270 primitive calls) in 0.006 seconds</span>
<span class="go"> Ordered by: internal time, cumulative time</span>
<span class="go"> ncalls tottime percall cumtime percall filename:lineno(function)</span>
<span class="go"> 10 0.001 0.000 0.005 0.001 series.py:5515(_arith_method)</span>
<span class="go"> 10 0.001 0.000 0.001 0.000 _ufunc_config.py:425(__init__)</span>
<span class="go"> 10 0.000 0.000 0.000 0.000 {built-in method _operator.add}</span>
<span class="go"> 10 0.000 0.000 0.002 0.000 series.py:315(__init__)</span>
<span class="go">...</span>
</pre></div>
</div>
<p>The UDF IDs can be seen in the query plan, for example, <code class="docutils literal notranslate"><span class="pre">add1(...)#2L</span></code> in <code class="docutils literal notranslate"><span class="pre">ArrowEvalPython</span></code> below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">added</span><span class="o">.</span><span class="n">explain</span><span class="p">()</span>
<span class="go">== Physical Plan ==</span>
<span class="go">*(2) Project [pythonUDF0#11L AS add1(id)#3L]</span>
<span class="go">+- ArrowEvalPython [add1(id#0L)#2L], [pythonUDF0#11L], 200</span>
<span class="go"> +- *(1) Range (0, 10, step=1, splits=16)</span>
</pre></div>
</div>
<p>This feature is not supported with registered UDFs.</p>
</div>
</div>
<div class="section" id="common-exceptions-errors">
<h2>Common Exceptions / Errors<a class="headerlink" href="#common-exceptions-errors" title="Permalink to this headline">¶</a></h2>
<div class="section" id="pyspark-sql">
<h3>PySpark SQL<a class="headerlink" href="#pyspark-sql" title="Permalink to this headline">¶</a></h3>
<p><strong>AnalysisException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">AnalysisException</span></code> is raised when failing to analyze a SQL query plan.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;bad_key&#39;</span><span class="p">]</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.errors.exceptions.AnalysisException</span>: <span class="n">Cannot resolve column name &quot;bad_key&quot; among (id)</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;id&#39;</span><span class="p">]</span>
<span class="go">Column&lt;&#39;id&#39;&gt;</span>
</pre></div>
</div>
<p><strong>ParseException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">ParseException</span></code> is raised when failing to parse a SQL command.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;select * 1&quot;</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.errors.exceptions.ParseException</span><span class="w">:</span>
<span class="x">[PARSE_SYNTAX_ERROR] Syntax error at or near &#39;1&#39;: extra input &#39;1&#39;.(line 1, pos 9)</span>
<span class="x">== SQL ==</span>
<span class="x">select * 1</span>
<span class="x">---------^^^</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;select *&quot;</span><span class="p">)</span>
<span class="go">DataFrame[]</span>
</pre></div>
</div>
<p><strong>IllegalArgumentException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">IllegalArgumentException</span></code> is raised when passing an illegal or inappropriate argument.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.errors.exceptions.IllegalArgumentException</span>: <span class="n">requirement failed: Sampling fraction (-1.0) must be on interval [0, 1] without replacement</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)</span>
<span class="go">DataFrame[id: bigint]</span>
</pre></div>
</div>
<p><strong>PythonException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">PythonException</span></code> is thrown from Python workers.</p>
<p>You can see the type of exception that was thrown from the Python worker and its stack trace, such as the <code class="docutils literal notranslate"><span class="pre">TypeError</span></code> below.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">pyspark.sql.functions</span> <span class="k">as</span> <span class="nn">sf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">udf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">sf</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;abs&quot;</span><span class="p">,</span> <span class="n">udf</span><span class="p">(</span><span class="n">f</span><span class="p">)(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">22/04/12 14:52:31 ERROR Executor: Exception in task 7.0 in stage 37.0 (TID 232)</span>
<span class="go">org.apache.spark.api.python.PythonException: Traceback (most recent call last):</span>
<span class="go">...</span>
<span class="go">TypeError: Invalid argument, not a string or column: -1 of type &lt;class &#39;int&#39;&gt;. For column literals, use &#39;lit&#39;, &#39;array&#39;, &#39;struct&#39; or &#39;create_map&#39; function.</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">return</span> <span class="nb">abs</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;abs&quot;</span><span class="p">,</span> <span class="n">udf</span><span class="p">(</span><span class="n">f</span><span class="p">)(</span><span class="s2">&quot;id&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(id=-1, abs=&#39;1&#39;), Row(id=0, abs=&#39;0&#39;)]</span>
</pre></div>
</div>
<p><strong>StreamingQueryException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">StreamingQueryException</span></code> is raised when failing a StreamingQuery. Most often, it is thrown from Python workers, that wrap it as a <code class="docutils literal notranslate"><span class="pre">PythonException</span></code>.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">sdf</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">readStream</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;text&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s2">&quot;python/test_support/sql/streaming&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">col</span><span class="p">,</span> <span class="n">udf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bad_udf</span> <span class="o">=</span> <span class="n">udf</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="mi">1</span> <span class="o">/</span> <span class="mi">0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">bad_udf</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;value&quot;</span><span class="p">)))</span><span class="o">.</span><span class="n">writeStream</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;memory&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">queryName</span><span class="p">(</span><span class="s2">&quot;q1&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">start</span><span class="p">())</span><span class="o">.</span><span class="n">processAllAvailable</span><span class="p">()</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">org.apache.spark.api.python.PythonException</span>: <span class="n">Traceback (most recent call last):</span>
File <span class="nb">&quot;&lt;stdin&gt;&quot;</span>, line <span class="m">1</span>, in <span class="n">&lt;lambda&gt;</span>
<span class="gr">ZeroDivisionError</span>: <span class="n">division by zero</span>
<span class="x">...</span>
<span class="x">pyspark.errors.exceptions.StreamingQueryException: [STREAM_FAILED] Query [id = 74eb53a8-89bd-49b0-9313-14d29eed03aa, runId = 9f2d5cf6-a373-478d-b718-2c2b6d8a0f24] terminated with exception: Job aborted</span>
</pre></div>
</div>
<p>Solution:</p>
<p>Fix the StreamingQuery and re-execute the workflow.</p>
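<p>For example, a minimal sketch of such a fix, reusing the <code class="docutils literal notranslate"><span class="pre">sdf</span></code> from above (the query name <code class="docutils literal notranslate"><span class="pre">q2</span></code> is arbitrary), replaces the failing UDF with one that does not raise and restarts the query:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>&gt;&gt;&gt; good_udf = udf(lambda x: x)  # no longer divides by zero
&gt;&gt;&gt; (sdf.select(good_udf(col("value"))).writeStream.format("memory").queryName("q2").start()).processAllAvailable()
</pre></div>
</div>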
<p><strong>SparkUpgradeException</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">SparkUpgradeException</span></code> is thrown because of Spark upgrade.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">to_date</span><span class="p">,</span> <span class="n">unix_timestamp</span><span class="p">,</span> <span class="n">from_unixtime</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="s2">&quot;2014-31-12&quot;</span><span class="p">,)],</span> <span class="p">[</span><span class="s2">&quot;date_str&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="n">to_date</span><span class="p">(</span><span class="n">from_unixtime</span><span class="p">(</span><span class="n">unix_timestamp</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="s2">&quot;yyyy-dd-aa&quot;</span><span class="p">))))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">pyspark.sql.utils.SparkUpgradeException</span>: <span class="n">You may get a different result due to the upgrading to Spark &gt;= 3.0: Fail to recognize &#39;yyyy-dd-aa&#39; pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;spark.sql.legacy.timeParserPolicy&quot;</span><span class="p">,</span> <span class="s2">&quot;LEGACY&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="n">to_date</span><span class="p">(</span><span class="n">from_unixtime</span><span class="p">(</span><span class="n">unix_timestamp</span><span class="p">(</span><span class="s2">&quot;date_str&quot;</span><span class="p">,</span> <span class="s2">&quot;yyyy-dd-aa&quot;</span><span class="p">))))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(date_str=&#39;2014-31-12&#39;, to_date(from_unixtime(unix_timestamp(date_str, yyyy-dd-aa), yyyy-MM-dd HH:mm:ss))=None)]</span>
</pre></div>
</div>
</div>
<div class="section" id="pandas-api-on-spark">
<h3>pandas API on Spark<a class="headerlink" href="#pandas-api-on-spark" title="Permalink to this headline">¶</a></h3>
<p>There are common exceptions / errors specific to the pandas API on Spark.</p>
<p><strong>ValueError: Cannot combine the series or dataframe because it comes from a different dataframe</strong></p>
<p>Operations involving more than one Series or DataFrame raise a <code class="docutils literal notranslate"><span class="pre">ValueError</span></code> if <code class="docutils literal notranslate"><span class="pre">compute.ops_on_diff_frames</span></code> is disabled (it is disabled by default). Such operations can be expensive because they require joining the underlying Spark DataFrames, so users should be aware of the cost and enable that flag only when necessary.</p>
<p>Exception:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">ValueError</span>: <span class="n">Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable &#39;compute.ops_on_diff_frames&#39; option.</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">with</span> <span class="n">ps</span><span class="o">.</span><span class="n">option_context</span><span class="p">(</span><span class="s1">&#39;compute.ops_on_diff_frames&#39;</span><span class="p">,</span> <span class="kc">True</span><span class="p">):</span>
<span class="gp">... </span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span> <span class="o">+</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">])</span>
<span class="gp">...</span>
<span class="go">0 4</span>
<span class="go">1 6</span>
<span class="go">dtype: int64</span>
</pre></div>
</div>
<p><strong>RuntimeError: Result vector from pandas_udf was not the required length</strong></p>
<p>Exception:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">]:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">x</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;x&quot;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s2">&quot;y&quot;</span><span class="p">:[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]})</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="go">22/04/12 13:46:39 ERROR Executor: Exception in task 2.0 in stage 16.0 (TID 88)</span>
<span class="go">org.apache.spark.api.python.PythonException: Traceback (most recent call last):</span>
<span class="go">...</span>
<span class="go">RuntimeError: Result vector from pandas_udf was not the required length: expected 1, got 0</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">]:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">x</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;x&quot;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s2">&quot;y&quot;</span><span class="p">:[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]})</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="go"> x y</span>
<span class="go">0 1 3</span>
<span class="go">1 2 4</span>
</pre></div>
</div>
</div>
<div class="section" id="id11">
<h3>Py4j<a class="headerlink" href="#id11" title="Permalink to this headline">ΒΆ</a></h3>
<p><strong>Py4JJavaError</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">Py4JJavaError</span></code> is raised when an exception occurs in the Java client code.
You can see the type of exception that was thrown on the Java side and its stack trace, as <code class="docutils literal notranslate"><span class="pre">java.lang.NullPointerException</span></code> below.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">py4j.protocol.Py4JJavaError</span>: <span class="n">An error occurred while calling None.java.lang.String.</span>
<span class="x">: java.lang.NullPointerException</span>
<span class="x">..</span>
</pre></div>
</div>
<p>Solution:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">)</span>
<span class="go">&#39;x&#39;</span>
</pre></div>
</div>
<p><strong>Py4JError</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">Py4JError</span></code> is raised when any other error occurs such as when the Python client program tries to access an object that no longer exists on the Java side.</p>
<p>Example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.regression</span> <span class="kn">import</span> <span class="n">LinearRegression</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;weight&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">],</span>
<span class="gp">... </span> <span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span> <span class="o">=</span> <span class="n">LinearRegression</span><span class="p">(</span>
<span class="gp">... </span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">regParam</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">solver</span><span class="o">=</span><span class="s2">&quot;normal&quot;</span><span class="p">,</span> <span class="n">weightCol</span><span class="o">=</span><span class="s2">&quot;weight&quot;</span><span class="p">,</span> <span class="n">fitIntercept</span><span class="o">=</span><span class="kc">False</span>
<span class="gp">... </span> <span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span>
<span class="go">LinearRegressionModel: uid=LinearRegression_eb7bc1d4bf25, numFeatures=1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="fm">__del__</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="c">...</span>
<span class="gr">py4j.protocol.Py4JError</span>: <span class="n">An error occurred while calling o531.toString. Trace:</span>
<span class="x">py4j.Py4JException: Target Object ID does not exist for this gateway :o531</span>
<span class="x">...</span>
</pre></div>
</div>
<p>Solution:</p>
<p>Access only objects that still exist on the Java side. In the example above, the Java object backing <code class="docutils literal notranslate"><span class="pre">model</span></code> was released by <code class="docutils literal notranslate"><span class="pre">__del__()</span></code>, so a live object must be obtained before the reference is used again.</p>
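<p>For the example above, that means obtaining a live Java object again, for instance by refitting the model, rather than reusing the deleted reference (a sketch; the uid in the output will differ):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>&gt;&gt;&gt; model = lr.fit(df)  # fit again so a live Java object backs the Python reference
&gt;&gt;&gt; model
LinearRegressionModel: uid=LinearRegression_..., numFeatures=1
</pre></div>
</div>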
<p><strong>Py4JNetworkError</strong></p>
<p><code class="docutils literal notranslate"><span class="pre">Py4JNetworkError</span></code> is raised when a problem occurs during network transfer (e.g., connection lost). In this case, we shall debug the network and rebuild the connection.</p>
</div>
</div>
<div class="section" id="stack-traces">
<h2>Stack Traces<a class="headerlink" href="#stack-traces" title="Permalink to this headline">ΒΆ</a></h2>
<p>There are Spark configurations to control stack traces:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled</span></code> is true by default to simplify the traceback from Python UDFs.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.jvmStacktrace.enabled</span></code> is false by default to hide the JVM stack trace and show only a Python-friendly exception.</p></li>
</ul>
<p>The Spark configurations above are independent of the log level settings; control log levels through <a class="reference internal" href="../reference/api/pyspark.SparkContext.setLogLevel.html#pyspark.SparkContext.setLogLevel" title="pyspark.SparkContext.setLogLevel"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyspark.SparkContext.setLogLevel()</span></code></a>.</p>
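<p>For example, to see the full JVM stack trace and the unsimplified Python traceback while debugging, both configurations can be toggled at runtime (a sketch using the keys listed above; the log level is controlled separately):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>&gt;&gt;&gt; spark.conf.set(&quot;spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled&quot;, False)
&gt;&gt;&gt; spark.conf.set(&quot;spark.sql.pyspark.jvmStacktrace.enabled&quot;, True)
&gt;&gt;&gt; spark.sparkContext.setLogLevel(&quot;INFO&quot;)  # log level is independent of the settings above
</pre></div>
</div>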
</div>
</div>
</div>
<!-- Previous / next buttons -->
<div class='prev-next-area'>
<a class='left-prev' id="prev-link" href="testing.html" title="previous page">
<i class="fas fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Testing PySpark</p>
</div>
</a>
<a class='right-next' id="next-link" href="setting_ide.html" title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Setting up IDEs</p>
</div>
<i class="fas fa-angle-right"></i>
</a>
</div>
</main>
</div>
</div>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p class="copyright">
&copy; Copyright .<br>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br>
</p>
</div>
</div>
</footer>
</body>
</html>