| |
| |
| |
| |
| <!DOCTYPE html> |
| <html class="no-js"> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>Structured Streaming Programming Guide - Spark 4.0.1 Documentation</title> |
| |
| |
| |
| |
| |
| <link rel="stylesheet" href="../css/bootstrap.min.css"> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet"> |
| <link href="../css/custom.css" rel="stylesheet"> |
| <script src="../js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="../css/pygments-default.css"> |
| <link rel="stylesheet" href="../css/docsearch.min.css" /> |
| <link rel="stylesheet" href="../css/docsearch.css"> |
| |
| |
| |
| |
| </head> |
| <body class="global"> |
| |
| |
| |
| <div class="container"> |
| |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="../streaming/index.html">Structured Streaming Programming Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="../streaming/index.html"> |
| |
| Overview |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="../streaming/getting-started.html"> |
| |
| Getting Started |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="../streaming/apis-on-dataframes-and-datasets.html"> |
| |
| APIs on DataFrames and Datasets |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="../streaming/performance-tips.html"> |
| |
| Performance Tips |
| |
| </a> |
| </li> |
| |
| |
| |
| <ul> |
| |
| <li> |
| <a href="../streaming/performance-tips.html#asynchronous-progress-tracking"> |
| |
| Asynchronous Progress Tracking |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="../streaming/performance-tips.html#continuous-processing"> |
| |
| Continuous Processing |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| |
| |
| <li> |
| <a href="../streaming/additional-information.html"> |
| |
| Additional Information |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| |
| |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar mr-3" id="content"> |
| |
| <h1 class="title">Structured Streaming Programming Guide</h1> |
| |
| |
| |
| <h1 id="asynchronous-progress-tracking">Asynchronous Progress Tracking</h1> |
| <h2 id="what-is-it">What is it?</h2> |
| |
<p>Asynchronous progress tracking allows streaming queries to checkpoint progress asynchronously and in parallel with the actual data processing within a micro-batch, reducing the latency associated with maintaining the offset log and commit log.</p>
| |
| <p><img src="../img/async-progress.png" alt="Async Progress Tracking" /></p> |
| |
| <h2 id="how-does-it-work">How does it work?</h2> |
| |
<p>Structured Streaming relies on persisting and managing offsets as progress indicators for query processing. These offset management operations directly impact processing latency, because no data processing can occur until they are complete. Asynchronous progress tracking enables streaming queries to checkpoint progress without being blocked by these offset management operations.</p>
| |
| <h2 id="how-to-use-it">How to use it?</h2> |
| |
| <p>The code snippet below provides an example of how to use this feature:</p> |
| <div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">val</span> <span class="nv">stream</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">readStream</span> |
| <span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"kafka.bootstrap.servers"</span><span class="o">,</span> <span class="s">"host1:port1,host2:port2"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"subscribe"</span><span class="o">,</span> <span class="s">"in"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">load</span><span class="o">()</span> |
| <span class="k">val</span> <span class="nv">query</span> <span class="k">=</span> <span class="nv">stream</span><span class="o">.</span><span class="py">writeStream</span> |
| <span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"topic"</span><span class="o">,</span> <span class="s">"out"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"checkpointLocation"</span><span class="o">,</span> <span class="s">"/tmp/checkpoint"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"asyncProgressTrackingEnabled"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">start</span><span class="o">()</span> |
| </code></pre></div></div> |
| |
| <p>The table below describes the configurations for this feature and default values associated with them.</p> |
| |
| <table> |
| <thead> |
| <tr> |
| <th>Option</th> |
| <th>Value</th> |
| <th>Default</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>asyncProgressTrackingEnabled</td> |
| <td>true/false</td> |
| <td>false</td> |
| <td>enable or disable asynchronous progress tracking</td> |
| </tr> |
| <tr> |
      <td>asyncProgressTrackingCheckpointIntervalMs</td>
      <td>milliseconds</td>
      <td>1000</td>
      <td>the interval at which offsets and completion commits are checkpointed</td>
    </tr>
| </tbody> |
| </table> |
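<p>For example, to checkpoint progress roughly every five seconds instead of the default one second, both options can be set on the writer. This is a minimal sketch extending the earlier example; the servers, topic, and checkpoint path are placeholders:</p>

<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>val query = stream.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("topic", "out")
  .option("checkpointLocation", "/tmp/checkpoint")
  // enable async progress tracking and checkpoint every 5000 ms
  .option("asyncProgressTrackingEnabled", "true")
  .option("asyncProgressTrackingCheckpointIntervalMs", "5000")
  .start()
</code></pre></div></div>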
| |
| <h2 id="limitations">Limitations</h2> |
| <p>The initial version of the feature has the following limitations:</p> |
| |
| <ul> |
  <li>Asynchronous progress tracking is supported only in stateless queries that use the Kafka sink.</li>
  <li>Exactly-once end-to-end processing is not supported with asynchronous progress tracking, because the offset ranges for a batch can change in case of failure. Note, however, that many sinks, such as the Kafka sink, do not support exactly-once writes anyway.</li>
| </ul> |
| |
| <h2 id="switching-the-setting-off">Switching the setting off</h2> |
<p>Turning async progress tracking off may cause the following exception to be thrown:</p>
| |
| <div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nv">java</span><span class="o">.</span><span class="py">lang</span><span class="o">.</span><span class="py">IllegalStateException</span><span class="k">:</span> <span class="kt">batch</span> <span class="kt">x</span> <span class="kt">doesn</span><span class="err">'</span><span class="kt">t</span> <span class="kt">exist</span> |
| </code></pre></div></div> |
| |
<p>The following error message may also be printed in the driver logs:</p>
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>The offset log for batch x doesn't exist, which is required to restart the query from the latest batch x from the offset log. Please ensure there are two subsequent offset logs available for the latest batch via manually deleting the offset file(s). Please also ensure the latest batch for commit log is equal or one batch earlier than the latest batch for offset log. |
| </code></pre></div></div> |
| |
<p>This is caused by the fact that when async progress tracking is enabled, the framework does not checkpoint progress for every batch, as it would if async progress tracking were not used. To solve this problem, simply re-enable “asyncProgressTrackingEnabled”, set “asyncProgressTrackingCheckpointIntervalMs” to 0, and run the streaming query until at least two micro-batches have been processed. Async progress tracking can then be safely disabled, and restarting the query should proceed normally.</p>
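<p>As a sketch, the recovery procedure above could look like the following, reusing the <code class="language-plaintext highlighter-rouge">stream</code> defined in the earlier example (the servers, topic, and checkpoint path are placeholders):</p>

<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Temporarily re-enable async progress tracking with an interval of 0
// so that progress is checkpointed for every micro-batch.
val query = stream.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("topic", "out")
  .option("checkpointLocation", "/tmp/checkpoint")
  .option("asyncProgressTrackingEnabled", "true")
  .option("asyncProgressTrackingCheckpointIntervalMs", "0")
  .start()

// After at least two micro-batches have completed, stop the query;
// it can then be restarted from the same checkpoint location with
// asyncProgressTrackingEnabled set to "false" (or removed entirely).
query.stop()
</code></pre></div></div>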
| |
| <h1 id="continuous-processing">Continuous Processing</h1> |
| <h2 id="experimental">[Experimental]</h2> |
| |
<p><strong>Continuous processing</strong> is a new, experimental streaming execution mode introduced in Spark 2.3 that enables low (~1 ms) end-to-end latency with at-least-once fault-tolerance guarantees. Compare this with the default <em>micro-batch processing</em> engine, which can achieve exactly-once guarantees but achieves latencies of ~100 ms at best. For some types of queries (discussed below), you can choose which mode to execute them in without modifying the application logic (i.e. without changing the DataFrame/Dataset operations).</p>
| |
| <p>To run a supported query in continuous processing mode, all you need to do is specify a <strong>continuous trigger</strong> with the desired checkpoint interval as a parameter. For example,</p> |
| |
| <div class="codetabs"> |
| |
| <div data-lang="python"> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">spark</span> \ |
| <span class="p">.</span><span class="n">readStream</span> \ |
| <span class="p">.</span><span class="nf">format</span><span class="p">(</span><span class="sh">"</span><span class="s">kafka</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="nf">option</span><span class="p">(</span><span class="sh">"</span><span class="s">kafka.bootstrap.servers</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">host1:port1,host2:port2</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="nf">option</span><span class="p">(</span><span class="sh">"</span><span class="s">subscribe</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">topic1</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="nf">load</span><span class="p">()</span> \ |
| <span class="p">.</span><span class="nf">selectExpr</span><span class="p">(</span><span class="sh">"</span><span class="s">CAST(key AS STRING)</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">CAST(value AS STRING)</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="n">writeStream</span> \ |
| <span class="p">.</span><span class="nf">format</span><span class="p">(</span><span class="sh">"</span><span class="s">kafka</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="nf">option</span><span class="p">(</span><span class="sh">"</span><span class="s">kafka.bootstrap.servers</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">host1:port1,host2:port2</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="nf">option</span><span class="p">(</span><span class="sh">"</span><span class="s">topic</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">topic1</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="nf">trigger</span><span class="p">(</span><span class="n">continuous</span><span class="o">=</span><span class="sh">"</span><span class="s">1 second</span><span class="sh">"</span><span class="p">)</span> \ <span class="c1"># only change in query |
| </span> <span class="p">.</span><span class="nf">start</span><span class="p">()</span></code></pre></figure> |
| |
| </div> |
| |
| <div data-lang="scala"> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.streaming.Trigger</span> |
| |
| <span class="n">spark</span> |
| <span class="o">.</span><span class="py">readStream</span> |
| <span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"kafka.bootstrap.servers"</span><span class="o">,</span> <span class="s">"host1:port1,host2:port2"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"subscribe"</span><span class="o">,</span> <span class="s">"topic1"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">load</span><span class="o">()</span> |
| <span class="o">.</span><span class="py">selectExpr</span><span class="o">(</span><span class="s">"CAST(key AS STRING)"</span><span class="o">,</span> <span class="s">"CAST(value AS STRING)"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">writeStream</span> |
| <span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"kafka.bootstrap.servers"</span><span class="o">,</span> <span class="s">"host1:port1,host2:port2"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"topic"</span><span class="o">,</span> <span class="s">"topic1"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">trigger</span><span class="o">(</span><span class="nv">Trigger</span><span class="o">.</span><span class="py">Continuous</span><span class="o">(</span><span class="s">"1 second"</span><span class="o">))</span> <span class="c1">// only change in query</span> |
| <span class="o">.</span><span class="py">start</span><span class="o">()</span></code></pre></figure> |
| |
| </div> |
| |
| <div data-lang="java"> |
| |
| <figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.sql.streaming.Trigger</span><span class="o">;</span> |
| |
| <span class="n">spark</span> |
| <span class="o">.</span><span class="na">readStream</span> |
| <span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"kafka.bootstrap.servers"</span><span class="o">,</span> <span class="s">"host1:port1,host2:port2"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"subscribe"</span><span class="o">,</span> <span class="s">"topic1"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">load</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">selectExpr</span><span class="o">(</span><span class="s">"CAST(key AS STRING)"</span><span class="o">,</span> <span class="s">"CAST(value AS STRING)"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">writeStream</span> |
| <span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"kafka.bootstrap.servers"</span><span class="o">,</span> <span class="s">"host1:port1,host2:port2"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"topic"</span><span class="o">,</span> <span class="s">"topic1"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">trigger</span><span class="o">(</span><span class="nc">Trigger</span><span class="o">.</span><span class="na">Continuous</span><span class="o">(</span><span class="s">"1 second"</span><span class="o">))</span> <span class="c1">// only change in query</span> |
| <span class="o">.</span><span class="na">start</span><span class="o">();</span></code></pre></figure> |
| |
| </div> |
| |
| </div> |
| |
| <p>A checkpoint interval of 1 second means that the continuous processing engine will record the progress of the query every second. The resulting checkpoints are in a format compatible with the micro-batch engine, hence any query can be restarted with any trigger. For example, a supported query started with the micro-batch mode can be restarted in continuous mode, and vice versa. Note that any time you switch to continuous mode, you will get at-least-once fault-tolerance guarantees.</p> |
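<p>For instance, a supported query first started with the default micro-batch engine can later be restarted from the same checkpoint in continuous mode simply by changing the trigger. The sketch below assumes a streaming DataFrame <code class="language-plaintext highlighter-rouge">df</code>; the servers, topic, and checkpoint path are placeholders:</p>

<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.sql.streaming.Trigger

// First run: micro-batch engine with a processing-time trigger.
df.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("topic", "out")
  .option("checkpointLocation", "/tmp/checkpoint")
  .trigger(Trigger.ProcessingTime("1 second"))
  .start()

// Later restart from the same checkpoint location in continuous mode;
// the checkpoint format is compatible between the two engines.
df.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("topic", "out")
  .option("checkpointLocation", "/tmp/checkpoint")
  .trigger(Trigger.Continuous("1 second"))
  .start()
</code></pre></div></div>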
| |
| <h2 id="supported-queries">Supported Queries</h2> |
| |
<p>As of Spark 2.4, only the following types of queries are supported in the continuous processing mode.</p>
| |
| <ul> |
| <li><em>Operations</em>: Only map-like Dataset/DataFrame operations are supported in continuous mode, that is, only projections (<code class="language-plaintext highlighter-rouge">select</code>, <code class="language-plaintext highlighter-rouge">map</code>, <code class="language-plaintext highlighter-rouge">flatMap</code>, <code class="language-plaintext highlighter-rouge">mapPartitions</code>, etc.) and selections (<code class="language-plaintext highlighter-rouge">where</code>, <code class="language-plaintext highlighter-rouge">filter</code>, etc.). |
| <ul> |
<li>All SQL functions are supported except aggregation functions (since aggregations are not yet supported), <code class="language-plaintext highlighter-rouge">current_timestamp()</code>, and <code class="language-plaintext highlighter-rouge">current_date()</code> (deterministic computations using time are challenging).</li>
| </ul> |
| </li> |
| <li><em>Sources</em>: |
| <ul> |
| <li>Kafka source: All options are supported.</li> |
<li>Rate source: Good for testing. The only options supported in continuous mode are <code class="language-plaintext highlighter-rouge">numPartitions</code> and <code class="language-plaintext highlighter-rouge">rowsPerSecond</code>.</li>
| </ul> |
| </li> |
| <li><em>Sinks</em>: |
| <ul> |
| <li>Kafka sink: All options are supported.</li> |
| <li>Memory sink: Good for debugging.</li> |
<li>Console sink: Good for debugging. All options are supported. Note that the console prints output at every checkpoint interval specified in the continuous trigger.</li>
| </ul> |
| </li> |
| </ul> |
| |
<p>See the <a href="./apis-on-dataframes-and-datasets.html#input-sources">Input Sources</a> and <a href="./apis-on-dataframes-and-datasets.html#output-sinks">Output Sinks</a> sections for more details. While the console sink is good for testing, end-to-end low-latency processing is best observed with Kafka as both the source and the sink, as this allows the engine to process the data and make the results available in the output topic within milliseconds of the input data arriving in the input topic.</p>
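<p>For a quick local experiment, the rate source and console sink can be combined into a self-contained continuous query. This is a minimal sketch; the rates, trigger interval, and checkpoint path are arbitrary placeholders:</p>

<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.sql.streaming.Trigger

// Generate 10 rows per second across 2 partitions. Only map-like
// operations (projections and selections) are allowed in continuous mode.
val rateDf = spark.readStream
  .format("rate")
  .option("rowsPerSecond", "10")
  .option("numPartitions", "2")
  .load()

rateDf.selectExpr("timestamp", "value")
  .where("value % 2 = 0")
  .writeStream
  .format("console")
  .option("checkpointLocation", "/tmp/continuous-checkpoint")
  .trigger(Trigger.Continuous("1 second"))
  .start()
</code></pre></div></div>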
| |
| <h2 id="caveats">Caveats</h2> |
| |
| <ul> |
<li>The continuous processing engine launches multiple long-running tasks that continuously read data from sources, process it, and continuously write to sinks. The number of tasks required by the query depends on how many partitions the query can read from the sources in parallel. Therefore, before starting a continuous processing query, you must ensure there are enough cores in the cluster to run all the tasks in parallel. For example, if you are reading from a Kafka topic that has 10 partitions, then the cluster must have at least 10 cores for the query to make progress (see the sketch after this list).</li>
| <li>Stopping a continuous processing stream may produce spurious task termination warnings. These can be safely ignored.</li> |
<li>There are currently no automatic retries of failed tasks. Any failure will lead to the query being stopped, and it will need to be manually restarted from the checkpoint.</li>
| </ul> |
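<p>As a rough illustration of the first caveat, a hypothetical local test of a continuous query over a 10-partition Kafka topic needs at least 10 local cores so that every long-running task can be scheduled at once:</p>

<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.sql.SparkSession

// Reserve 10 local cores, one per Kafka partition; with fewer cores
// some continuous tasks could never be scheduled and the query
// would not make progress.
val spark = SparkSession.builder()
  .master("local[10]")
  .appName("continuous-processing-test")
  .getOrCreate()
</code></pre></div></div>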
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="../js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="../js/vendor/bootstrap.bundle.min.js"></script> |
| |
| <script src="../js/vendor/anchor.min.js"></script> |
| <script src="../js/main.js"></script> |
| |
| <script type="text/javascript" src="../js/vendor/docsearch.min.js"></script> |
| <script type="text/javascript"> |
| // DocSearch is entirely free and automated. DocSearch is built in two parts: |
| // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link |
| // in your website and extract content from every page it traverses. It then pushes this |
| // content to an Algolia index. |
| // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index |
| // to your search input and display its results in a dropdown UI. If you want to find more |
| // details on how works DocSearch, check the docs of DocSearch. |
| docsearch({ |
| apiKey: 'd62f962a82bc9abb53471cb7b89da35e', |
| appId: 'RAI69RXRSK', |
| indexName: 'apache_spark', |
| inputSelector: '#docsearch-input', |
| enhancedSearchInput: true, |
| algoliaOptions: { |
| 'facetFilters': ["version:4.0.1"] |
| }, |
| debug: false // Set debug to true if you want to inspect the dropdown |
| }); |
| |
| </script> |
| |
| </body> |
| </html> |