<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>Debugging PySpark — PySpark 3.1.1 documentation</title>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/development/debugging.html" />
</head>
<body>

<main role="main">
| <div class="section" id="debugging-pyspark"> |
| <h1>Debugging PySpark<a class="headerlink" href="#debugging-pyspark" title="Permalink to this headline">¶</a></h1> |
<p>PySpark uses Spark as its engine and relies on <a href="https://www.py4j.org/">Py4J</a> to submit jobs to Spark and compute results.</p>
<p>On the driver side, PySpark communicates with the JVM driver via <a href="https://www.py4j.org/">Py4J</a>.
When a <a href="../reference/api/pyspark.sql.SparkSession.html#pyspark.sql.SparkSession"><code>pyspark.sql.SparkSession</code></a> or <a href="../reference/api/pyspark.SparkContext.html#pyspark.SparkContext"><code>pyspark.SparkContext</code></a> is created and initialized, PySpark launches a JVM
with which it communicates.</p>
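<p>As a quick illustration (a sketch added here, not part of the original page), you can peek at the Py4J bridge from a running session. Note that <code>_gateway</code> and <code>_jvm</code> are internal attributes shown only for demonstration, not a public API:</p>
<pre>
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# The Py4J gateway the driver uses to talk to the JVM (internal attribute).
print(sc._gateway)
# A round trip into the driver JVM via Py4J.
print(sc._jvm.java.lang.System.getProperty("java.version"))
</pre>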
<p>On the executor side, Python workers execute and handle Python-native functions and data. They are not launched if
a PySpark application does not require interaction between Python workers and JVMs; they are launched lazily, only when
Python-native functions or data have to be handled, for example when you execute pandas UDFs or
PySpark RDD APIs.</p>
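<p>For example, the following sketch (assuming an active session named <code>spark</code>) contrasts a query that stays entirely in the JVM with one that forces Python workers to launch:</p>
<pre>
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Pure DataFrame operations run in the JVM; no Python workers are launched.
spark.range(10).selectExpr("id * 2").show()

# Applying a Python lambda through the RDD API forces Python workers to start.
spark.range(10).rdd.map(lambda row: row.id * 2).collect()
</pre>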
<p>This page focuses on debugging the Python side of PySpark, on both the driver and executor sides, rather than debugging
the JVM. Profiling and debugging the JVM are described in <a href="https://spark.apache.org/developer-tools.html">Useful Developer Tools</a>.</p>
<p>Note that:</p>
<ul class="simple">
<li><p>If you are running locally, you can debug the driver side directly in your IDE without the remote debug feature. Setting up PySpark with IDEs is documented <a href="setting_ide.html#pycharm">here</a>.</p></li>
<li><p><em>There are many other ways of debugging PySpark applications</em>. For example, you can debug remotely with the open source <a href="https://www.pydev.org/manual_adv_remote_debugger.html">Remote Debugger</a> instead of the PyCharm Professional workflow documented here, or locally with Python's built-in <code>pdb</code>, as sketched below.</p></li>
</ul>
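<p>Because a locally run driver is an ordinary Python process, the standard <code>pdb</code> debugger works without any remote setup. A minimal sketch (an illustration added here, not from the original page):</p>
<pre>
import pdb

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(10)
pdb.set_trace()  # pauses here; inspect df, spark, etc. interactively
df.show()
</pre>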
| <div class="section" id="remote-debugging-pycharm-professional"> |
| <h2>Remote Debugging (PyCharm Professional)<a class="headerlink" href="#remote-debugging-pycharm-professional" title="Permalink to this headline">¶</a></h2> |
| <p>This section describes remote debugging on both driver and executor sides within a single machine to demonstrate easily. |
| The ways of debugging PySpark on the executor side is different from doing in the driver. Therefore, they will be demonstrated respectively. |
| In order to debug PySpark applications on other machines, please refer to the full instructions that are specific |
| to PyCharm, documented <a class="reference external" href="https://www.jetbrains.com/help/pycharm/remote-debugging-with-product.html">here</a>.</p> |
| <p>Firstly, choose <strong>Edit Configuration…</strong> from the <em>Run</em> menu. It opens the <strong>Run/Debug Configurations dialog</strong>. |
| You have to click <code class="docutils literal notranslate"><span class="pre">+</span></code> configuration on the toolbar, and from the list of available configurations, select <strong>Python Debug Server</strong>. |
| Enter the name of this new configuration, for example, <code class="docutils literal notranslate"><span class="pre">MyRemoteDebugger</span></code> and also specify the port number, for example <code class="docutils literal notranslate"><span class="pre">12345</span></code>.</p> |
| <img alt="PyCharm remote debugger setting" src="../_images/pyspark-remote-debug1.png" /> |
| <div class="line-block"> |
| <div class="line">After that, you should install the corresponding version of the <code class="docutils literal notranslate"><span class="pre">pydevd-pycharm</span></code> package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install.</div> |
| </div> |
| <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>pip install pydevd-pycharm~=<version of PyCharm on the local machine> |
| </pre></div> |
| </div> |
| <div class="section" id="driver-side"> |
| <h3>Driver Side<a class="headerlink" href="#driver-side" title="Permalink to this headline">¶</a></h3> |
| <p>To debug on the driver side, your application should be able to connect to the debugging server. Copy and paste the codes |
| with <code class="docutils literal notranslate"><span class="pre">pydevd_pycharm.settrace</span></code> to the top of your PySpark script. Suppose the script name is <code class="docutils literal notranslate"><span class="pre">app.py</span></code>:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span> <span class="s2">"#======================Copy and paste from the previous dialog===========================</span> |
| <span class="s2">import pydevd_pycharm</span> |
| <span class="s2">pydevd_pycharm.settrace('localhost', port=12345, stdoutToServer=True, stderrToServer=True)</span> |
| <span class="s2">#========================================================================================</span> |
| <span class="s2"># Your PySpark application codes:</span> |
| <span class="s2">from pyspark.sql import SparkSession</span> |
| <span class="s2">spark = SparkSession.builder.getOrCreate()</span> |
| <span class="s2">spark.range(10).show()"</span> > app.py |
| </pre></div> |
| </div> |
<p>Start debugging with your <code>MyRemoteDebugger</code> configuration.</p>
<img alt="PyCharm run remote debugger" src="../_images/pyspark-remote-debug2.png" />
<div class="line-block">
<div class="line">After that, submit your application. This connects to your PyCharm debugging server and lets you debug the driver side remotely.</div>
</div>
<pre>
spark-submit app.py
</pre>
</div>
| <div class="section" id="executor-side"> |
| <h3>Executor Side<a class="headerlink" href="#executor-side" title="Permalink to this headline">¶</a></h3> |
| <p>To debug on the executor side, prepare a Python file as below in your current working directory.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span> <span class="s2">"from pyspark import daemon, worker</span> |
| <span class="s2">def remote_debug_wrapped(*args, **kwargs):</span> |
| <span class="s2"> #======================Copy and paste from the previous dialog===========================</span> |
| <span class="s2"> import pydevd_pycharm</span> |
| <span class="s2"> pydevd_pycharm.settrace('localhost', port=12345, stdoutToServer=True, stderrToServer=True)</span> |
| <span class="s2"> #========================================================================================</span> |
| <span class="s2"> worker.main(*args, **kwargs)</span> |
| <span class="s2">daemon.worker_main = remote_debug_wrapped</span> |
| <span class="s2">if __name__ == '__main__':</span> |
| <span class="s2"> daemon.manager()"</span> > remote_debug.py |
| </pre></div> |
| </div> |
<p>This file will be used as the Python worker in your PySpark applications via the <code>spark.python.daemon.module</code> configuration.
Run the <code>pyspark</code> shell with the configuration below:</p>
<pre>
pyspark --conf spark.python.daemon.module=remote_debug
</pre>
<p>Now you are ready to debug remotely. Start debugging with your <code>MyRemoteDebugger</code> configuration.</p>
<img alt="PyCharm run remote debugger" src="../_images/pyspark-remote-debug2.png" />
<div class="line-block">
<div class="line">After that, run a job that creates Python workers, for example, as below. Here <code>repartition(1)</code> collapses the job to a single partition so that only one Python worker attaches to the debugger.</div>
</div>
<pre>
spark.range(10).repartition(1).rdd.map(lambda x: x).collect()
</pre>
</div>
</div>
| <div class="section" id="checking-resource-usage-top-and-ps"> |
| <h2>Checking Resource Usage (<code class="docutils literal notranslate"><span class="pre">top</span></code> and <code class="docutils literal notranslate"><span class="pre">ps</span></code>)<a class="headerlink" href="#checking-resource-usage-top-and-ps" title="Permalink to this headline">¶</a></h2> |
| <p>The Python processes on the driver and executor can be checked via typical ways such as <code class="docutils literal notranslate"><span class="pre">top</span></code> and <code class="docutils literal notranslate"><span class="pre">ps</span></code> commands.</p> |
| <div class="section" id="id2"> |
| <h3>Driver Side<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h3> |
| <p>On the driver side, you can get the process id from your PySpark shell easily as below to know the process id and resources.</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">import</span> <span class="nn">os</span><span class="p">;</span> <span class="n">os</span><span class="o">.</span><span class="n">getpid</span><span class="p">()</span> |
| <span class="go">18482</span> |
| </pre></div> |
| </div> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>ps -fe <span class="m">18482</span> |
| </pre></div> |
| </div> |
| <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>UID PID PPID C STIME TTY TIME CMD |
| 000 18482 12345 0 0:00PM ttys001 0:00.00 /.../python |
| </pre></div> |
| </div> |
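<p>On Unix systems you can also query the same information from inside the driver process with the standard <code>resource</code> module (a sketch added for illustration, not from the original page):</p>
<pre>
import os
import resource

print(os.getpid())
# Peak resident set size of this driver process
# (kilobytes on Linux, bytes on macOS).
print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
</pre>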
</div>
| <div class="section" id="id3"> |
| <h3>Executor Side<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h3> |
| <p>To check on the executor side, you can simply <code class="docutils literal notranslate"><span class="pre">grep</span></code> them to figure out the process |
| ids and relevant resources because Python workers are forked from <code class="docutils literal notranslate"><span class="pre">pyspark.daemon</span></code>.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>ps -fe <span class="p">|</span> grep pyspark.daemon |
| </pre></div> |
| </div> |
| <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon |
| 000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon |
| 000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon |
| 000 12345 1 0 0:00PM ttys000 0:00.00 /.../python -m pyspark.daemon |
| ... |
| </pre></div> |
| </div> |
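<p>The same check can be scripted from Python, which can be handy when collecting worker process IDs programmatically (a sketch, assuming a Unix-like system with <code>ps</code> available):</p>
<pre>
import subprocess

# Equivalent to the ps | grep pipeline above.
out = subprocess.run(["ps", "-fe"], capture_output=True, text=True).stdout
for line in out.splitlines():
    if "pyspark.daemon" in line:
        print(line)
</pre>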
</div>
</div>
| <div class="section" id="profiling-memory-usage-memory-profiler"> |
| <h2>Profiling Memory Usage (Memory Profiler)<a class="headerlink" href="#profiling-memory-usage-memory-profiler" title="Permalink to this headline">¶</a></h2> |
| <p><a class="reference external" href="https://github.com/pythonprofilers/memory_profiler">memory_profiler</a> is one of the profilers that allow you to |
| check the memory usage line by line. This method documented here <em>only works for the driver side</em>.</p> |
<p>Unless your driver program runs on another machine (e.g., YARN cluster mode), this tool makes it easy to
debug memory usage on the driver side. Suppose your PySpark script is named <code>profile_memory.py</code>;
you can profile it as below.</p>
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span> <span class="s2">"from pyspark.sql import SparkSession</span> |
| <span class="s2">#===Your function should be decorated with @profile===</span> |
| <span class="s2">from memory_profiler import profile</span> |
| <span class="s2">@profile</span> |
| <span class="s2">#=====================================================</span> |
| <span class="s2">def my_func():</span> |
| <span class="s2"> session = SparkSession.builder.getOrCreate()</span> |
| <span class="s2"> df = session.range(10000)</span> |
| <span class="s2"> return df.collect()</span> |
| <span class="s2">if __name__ == '__main__':</span> |
| <span class="s2"> my_func()"</span> > profile_memory.py |
| </pre></div> |
| </div> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python -m memory_profiler profile_memory.py |
| </pre></div> |
| </div> |
| <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>Filename: profile_memory.py |
| |
| Line # Mem usage Increment Line Contents |
| ================================================ |
| ... |
| 6 def my_func(): |
| 7 51.5 MiB 0.6 MiB session = SparkSession.builder.getOrCreate() |
| 8 51.5 MiB 0.0 MiB df = session.range(10000) |
| 9 54.4 MiB 2.8 MiB return df.collect() |
| </pre></div> |
| </div> |
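<p>For a whole-function measurement without the line-by-line decorator, memory_profiler also provides a <code>memory_usage</code> function. A sketch under the same driver-side assumptions (the function name <code>build_and_collect</code> is illustrative):</p>
<pre>
from memory_profiler import memory_usage
from pyspark.sql import SparkSession

def build_and_collect():
    session = SparkSession.builder.getOrCreate()
    return session.range(10000).collect()

# Sample this process's memory every 0.1 s while the function runs (values in MiB).
usage = memory_usage((build_and_collect, (), {}), interval=0.1)
print("peak:", max(usage), "MiB")
</pre>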
</div>
| <div class="section" id="identifying-hot-loops-python-profilers"> |
| <h2>Identifying Hot Loops (Python Profilers)<a class="headerlink" href="#identifying-hot-loops-python-profilers" title="Permalink to this headline">¶</a></h2> |
| <p><a class="reference external" href="https://docs.python.org/3/library/profile.html">Python Profilers</a> are useful built-in features in Python itself. These |
| provide deterministic profiling of Python programs with a lot of useful statistics. This section describes how to use it on |
| both driver and executor sides in order to identify expensive or hot code paths.</p> |
| <div class="section" id="id4"> |
| <h3>Driver Side<a class="headerlink" href="#id4" title="Permalink to this headline">¶</a></h3> |
| <p>To use this on driver side, you can use it as you would do for regular Python programs because PySpark on driver side is a |
| regular Python process unless you are running your driver program in another machine (e.g., YARN cluster mode).</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">echo</span> <span class="s2">"from pyspark.sql import SparkSession</span> |
| <span class="s2">spark = SparkSession.builder.getOrCreate()</span> |
| <span class="s2">spark.range(10).show()"</span> > app.py |
| </pre></div> |
| </div> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python -m cProfile app.py |
| </pre></div> |
| </div> |
| <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>... |
| 129215 function calls (125446 primitive calls) in 5.926 seconds |
| |
| Ordered by: standard name |
| |
| ncalls tottime percall cumtime percall filename:lineno(function) |
| 1198/405 0.001 0.000 0.083 0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist) |
| 561 0.001 0.000 0.001 0.000 <frozen importlib._bootstrap>:103(release) |
| 276 0.000 0.000 0.000 0.000 <frozen importlib._bootstrap>:143(__init__) |
| 276 0.000 0.000 0.002 0.000 <frozen importlib._bootstrap>:147(__enter__) |
| ... |
| </pre></div> |
| </div> |
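<p>To surface the hottest entries first, you can also drive <code>cProfile</code> and <code>pstats</code> from Python and sort by cumulative time. A sketch (wrapping the application in a <code>main()</code> function is an assumption for illustration):</p>
<pre>
import cProfile
import pstats

from pyspark.sql import SparkSession

def main():
    spark = SparkSession.builder.getOrCreate()
    spark.range(10).show()

cProfile.run("main()", "app.prof")           # write raw stats to a file
stats = pstats.Stats("app.prof")
stats.sort_stats("cumtime").print_stats(10)  # show the ten most expensive entries
</pre>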
</div>
| <div class="section" id="id5"> |
| <h3>Executor Side<a class="headerlink" href="#id5" title="Permalink to this headline">¶</a></h3> |
| <p>To use this on executor side, PySpark provides remote <a class="reference external" href="https://docs.python.org/3/library/profile.html">Python Profilers</a> for |
| executor side, which can be enabled by setting <code class="docutils literal notranslate"><span class="pre">spark.python.profile</span></code> configuration to <code class="docutils literal notranslate"><span class="pre">true</span></code>.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pyspark --conf spark.python.profile<span class="o">=</span><span class="nb">true</span> |
| </pre></div> |
| </div> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">))</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="nb">str</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">rdd</span><span class="o">.</span><span class="n">count</span><span class="p">()</span> |
| <span class="go">100</span> |
| <span class="gp">>>> </span><span class="n">sc</span><span class="o">.</span><span class="n">show_profiles</span><span class="p">()</span> |
| <span class="go">============================================================</span> |
| <span class="go">Profile of RDD<id=1></span> |
| <span class="go">============================================================</span> |
| <span class="go"> 728 function calls (692 primitive calls) in 0.004 seconds</span> |
| |
| <span class="go"> Ordered by: internal time, cumulative time</span> |
| |
| <span class="go"> ncalls tottime percall cumtime percall filename:lineno(function)</span> |
| <span class="go"> 12 0.001 0.000 0.001 0.000 serializers.py:210(load_stream)</span> |
| <span class="go"> 12 0.000 0.000 0.000 0.000 {built-in method _pickle.dumps}</span> |
| <span class="go"> 12 0.000 0.000 0.001 0.000 serializers.py:252(dump_stream)</span> |
| <span class="go"> 12 0.000 0.000 0.001 0.000 context.py:506(f)</span> |
| <span class="gp">...</span> |
| </pre></div> |
| </div> |
<p>This feature is supported only with RDD APIs.</p>
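<p>Instead of printing to the console, the collected profiles can be written to a directory with <code>SparkContext.dump_profiles</code>, or dumped automatically on driver exit by setting the <code>spark.python.profile.dump</code> configuration to a directory path. The path below is illustrative:</p>
<pre>
# In the same pyspark shell started with spark.python.profile=true;
# writes one profile file per RDD id.
sc.dump_profiles("/tmp/pyspark_profiles")
</pre>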
</div>
</div>
</div>
</main>
</body>
</html>