blob: 30a3f4b2c476ba3c7b4013e53789eb08fb8fccd1 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
<title>PySpark Overview &#8212; PySpark 4.0.0-preview2 documentation</title>
<script data-cfasync="false">
// Copy the stored "mode" and "theme" preferences from localStorage onto the
// root element's data-mode / data-theme attributes. This script is placed
// ahead of the stylesheet links in <head>.
// When nothing is stored, mode falls back to "" and theme to "light".
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
<script src="_static/jquery.js"></script>
<script src="_static/underscore.js"></script>
<script src="_static/doctools.js"></script>
<script src="_static/clipboard.min.js"></script>
<script src="_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'index';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/index.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Getting Started" href="getting_started/index.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en">
<!-- Matomo -->
<script type="text/javascript">
// Queue of tracker commands; each entry is a [methodName, ...args] array.
// An already-defined window._paq is reused rather than replaced.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
// Base URL of the analytics host; the tracker endpoint and script are both under it.
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// Create an async <script> for matomo.js and insert it before the first
// <script> element currently in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="#">
<img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="Logo image"/>
<!-- The dark-theme logo (class "only-dark") is emitted with document.write, so it is present only when scripts run. -->
<script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="#">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
// Write the search button into the page at this script's position.
// The button exists only when JavaScript runs; its markup is the template
// string below (icon-only button, named via title/aria-label "Search").
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview2
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
/**
 * Build the docs homepage URL for one entry of the versions JSON.
 *
 * The "{version}" placeholder in the URL pattern (the pattern itself is
 * supplied by the Jinja template) is replaced with entry.version.
 *
 * @param {{version: string}} entry - one record from versions.json.
 * @returns {string} the PySpark docs homepage URL for that version.
 */
function buildURL(entry) {
  const urlPattern = "https://spark.apache.org/docs/{version}/api/python/index.html";
  return urlPattern.replace("{version}", entry.version);
}
/**
 * Click handler for the version-switcher links.
 *
 * Sends a HEAD request for the current page's path under the selected docs
 * version. If the request succeeds the browser goes to that page; if it
 * fails the browser goes to that version's homepage (the link's own href).
 *
 * @param {Event} event - click event raised on a version link.
 * @returns {boolean} always false, which cancels the link's default
 *   navigation so the redirect is decided by the HEAD request instead.
 */
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "index.html";
  // currentTarget is the <a> this handler is bound to, even when the click
  // landed on a descendant node.
  const link = event.currentTarget || event.target;
  const otherDocsHomepage = link.getAttribute("href");
  // The homepage href already ends in "index.html", so concatenating the page
  // path onto it produced ".../index.htmlindex.html", which can never exist.
  // Resolve the page path relative to the homepage URL instead.
  const tryUrl = new URL(currentFilePath, link.href).href;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // otherwise fall back to the other version's homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Populate the version switcher menu that sits next to this script.
(function () {
  // The page contains more than one element with id "version_switcher", and
  // an id lookup only returns the first of them. Pick the menu that shares
  // this script's parent element instead. document.currentScript must be read
  // now, while the script is executing; it is null inside the async callback.
  var ownerScript = document.currentScript;
  var $menu = ownerScript ? $(ownerScript).parent().find(".dropdown-menu").first() : $();
  if (!$menu.length) {
    // Fallback: the original id-based lookup.
    $menu = $("#version_switcher");
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data) {
    // create the nodes first (before AJAX calls) to ensure the order is
    // correct (for now, links will go to doc version homepage)
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      // On click, try the matching page in the other version before its homepage.
      node.onclick = checkPageExistsAndRedirect;
      $menu.append(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// Write the light/dark theme toggle button at this script's position, so the
// button exists only when JavaScript runs. The button carries one icon <span>
// per data-mode value: "light", "dark" and "auto".
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
// Write a second copy of the search button, this time inside the
// "navbar-persistent--mobile" container. As with the header copy, the button
// exists only when JavaScript runs.
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar hide-on-wide">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="#">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview2
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
/**
 * Return the docs homepage URL for a versions.json entry by substituting
 * entry.version for "{version}" in the URL pattern (the pattern is supplied
 * by the Jinja template).
 *
 * @param {{version: string}} entry - one record from versions.json.
 * @returns {string} homepage URL of the PySpark docs for that version.
 */
function buildURL(entry) {
  const placeholder = "{version}";
  const homepagePattern = "https://spark.apache.org/docs/{version}/api/python/index.html";
  return homepagePattern.replace(placeholder, entry.version);
}
/**
 * Click handler for the version-switcher links.
 *
 * Sends a HEAD request for the current page's path under the selected docs
 * version. If the request succeeds the browser goes to that page; if it
 * fails the browser goes to that version's homepage (the link's own href).
 *
 * @param {Event} event - click event raised on a version link.
 * @returns {boolean} always false, which cancels the link's default
 *   navigation so the redirect is decided by the HEAD request instead.
 */
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "index.html";
  // currentTarget is the <a> this handler is bound to, even when the click
  // landed on a descendant node.
  const link = event.currentTarget || event.target;
  const otherDocsHomepage = link.getAttribute("href");
  // The homepage href already ends in "index.html", so concatenating the page
  // path onto it produced ".../index.htmlindex.html", which can never exist.
  // Resolve the page path relative to the homepage URL instead.
  const tryUrl = new URL(currentFilePath, link.href).href;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // otherwise fall back to the other version's homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Populate the version switcher menu that sits next to this script.
(function () {
  // The page contains more than one element with id "version_switcher", and
  // an id lookup only returns the first of them. Pick the menu that shares
  // this script's parent element instead. document.currentScript must be read
  // now, while the script is executing; it is null inside the async callback.
  var ownerScript = document.currentScript;
  var $menu = ownerScript ? $(ownerScript).parent().find(".dropdown-menu").first() : $();
  if (!$menu.length) {
    // Fallback: the original id-based lookup.
    $menu = $("#version_switcher");
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data) {
    // create the nodes first (before AJAX calls) to ensure the order is
    // correct (for now, links will go to doc version homepage)
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      // On click, try the matching page in the other version before its homepage.
      node.onclick = checkPageExistsAndRedirect;
      $menu.append(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// Write the sidebar's copy of the light/dark theme toggle button at this
// script's position, so the button exists only when JavaScript runs. The
// button carries one icon <span> per data-mode value: "light", "dark", "auto".
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article"></div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<section id="pyspark-overview">
<h1>PySpark Overview<a class="headerlink" href="#pyspark-overview" title="Permalink to this headline">#</a></h1>
<p><strong>Date</strong>: Sep 16, 2024 <strong>Version</strong>: 4.0.0-preview2</p>
<p><strong>Useful links</strong>:
<a class="reference external" href="https://mybinder.org/v2/gh/apache/spark/f0d465e09b8?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb">Live Notebook</a> | <a class="reference external" href="https://github.com/apache/spark">GitHub</a> | <a class="reference external" href="https://issues.apache.org/jira/projects/SPARK/issues">Issues</a> | <a class="reference external" href="https://github.com/apache/spark/tree/f0d465e09b8/examples/src/main/python">Examples</a> | <a class="reference external" href="https://spark.apache.org/community.html">Community</a> | <a class="reference external" href="https://stackoverflow.com/questions/tagged/pyspark">Stack Overflow</a> | <a class="reference external" href="https://lists.apache.org/list.html?dev&#64;spark.apache.org">Dev Mailing List</a> | <a class="reference external" href="https://lists.apache.org/list.html?user&#64;spark.apache.org">User Mailing List</a></p>
<p>PySpark is the Python API for Apache Spark. It enables you to perform real-time,
large-scale data processing in a distributed environment using Python. It also provides a PySpark
shell for interactively analyzing your data.</p>
<p>PySpark combines Python’s learnability and ease of use with the power of Apache Spark
to enable processing and analysis of data at any size for everyone familiar with Python.</p>
<p>PySpark supports all of Spark’s features such as Spark SQL,
DataFrames, Structured Streaming, Machine Learning (MLlib) and Spark Core.</p>
<table class="colwidths-given borderless spec-table table">
<colgroup>
<col style="width: 10%" />
<col style="width: 20%" />
<col style="width: 20%" />
<col style="width: 20%" />
<col style="width: 20%" />
<col style="width: 10%" />
</colgroup>
<tbody>
<tr class="row-odd"><td></td>
<td><a class="reference external image-reference" href="reference/pyspark.sql/index.html"><img alt="Spark SQL" src="_images/pyspark-spark_sql_and_dataframes.png" style="width: 100%;" /></a>
</td>
<td><a class="reference external image-reference" href="reference/pyspark.pandas/index.html"><img alt="Pandas API on Spark" src="_images/pyspark-pandas_api_on_spark.png" style="width: 100%;" /></a>
</td>
<td><a class="reference external image-reference" href="reference/pyspark.ss/index.html"><img alt="Streaming" src="_images/pyspark-structured_streaming.png" style="width: 100%;" /></a>
</td>
<td><a class="reference external image-reference" href="reference/pyspark.ml.html"><img alt="Machine Learning" src="_images/pyspark-machine_learning.png" style="width: 100%;" /></a>
</td>
<td></td>
</tr>
</tbody>
</table>
<table class="colwidths-given borderless spec-table table">
<colgroup>
<col style="width: 10%" />
<col style="width: 80%" />
<col style="width: 10%" />
</colgroup>
<tbody>
<tr class="row-odd"><td></td>
<td><a class="reference external image-reference" href="reference/pyspark.html"><img alt="Spark Core and RDDs" src="_images/pyspark-spark_core_and_rdds.png" style="width: 100%;" /></a>
</td>
<td></td>
</tr>
</tbody>
</table>
<p id="index-page-spark-sql-and-dataframes"><strong>Spark SQL and DataFrames</strong></p>
<p>Spark SQL is Apache Spark’s module for working with structured data.
It allows you to seamlessly mix SQL queries with Spark programs.
With PySpark DataFrames you can efficiently read, write, transform,
and analyze data using Python and SQL.
Whether you use Python or SQL, the same underlying execution
engine is used so you will always leverage the full power of Spark.</p>
<ul class="simple">
<li><p><a class="reference internal" href="getting_started/quickstart_df.html"><span class="std std-ref">Quickstart: DataFrame</span></a></p></li>
<li><p><a class="reference external" href="https://mybinder.org/v2/gh/apache/spark/f0d465e09b8?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb">Live Notebook: DataFrame</a></p></li>
<li><p><a class="reference internal" href="reference/pyspark.sql/index.html"><span class="std std-ref">Spark SQL API Reference</span></a></p></li>
</ul>
<p><strong>Pandas API on Spark</strong></p>
<p>Pandas API on Spark allows you to scale your pandas workload to any size
by running it distributed across multiple nodes. If you are already familiar
with pandas and want to leverage Spark for big data, pandas API on Spark makes
you immediately productive and lets you migrate your applications without modifying the code.
You can have a single codebase that works both with pandas (tests, smaller datasets)
and with Spark (production, distributed datasets) and you can switch between the
pandas API and the Pandas API on Spark easily and without overhead.</p>
<p>Pandas API on Spark aims to make the transition from pandas to Spark easy but
if you are new to Spark or deciding which API to use, we recommend using PySpark
(see <a class="reference internal" href="#index-page-spark-sql-and-dataframes"><span class="std std-ref">Spark SQL and DataFrames</span></a>).</p>
<ul class="simple">
<li><p><a class="reference internal" href="getting_started/quickstart_ps.html"><span class="std std-ref">Quickstart: Pandas API on Spark</span></a></p></li>
<li><p><a class="reference external" href="https://mybinder.org/v2/gh/apache/spark/f0d465e09b8?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_ps.ipynb">Live Notebook: pandas API on Spark</a></p></li>
<li><p><a class="reference internal" href="reference/pyspark.pandas/index.html"><span class="std std-ref">Pandas API on Spark Reference</span></a></p></li>
</ul>
<p id="index-page-structured-streaming"><strong>Structured Streaming</strong></p>
<p>Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine.
You can express your streaming computation the same way you would express a batch computation on static data.
The Spark SQL engine will take care of running it incrementally and continuously and updating the final result
as streaming data continues to arrive.</p>
<ul class="simple">
<li><p><a class="reference external" href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html">Structured Streaming Programming Guide</a></p></li>
<li><p><a class="reference internal" href="reference/pyspark.ss/index.html"><span class="std std-ref">Structured Streaming API Reference</span></a></p></li>
</ul>
<p><strong>Machine Learning (MLlib)</strong></p>
<p>Built on top of Spark, MLlib is a scalable machine learning library that provides
a uniform set of high-level APIs that help users create and tune practical machine
learning pipelines.</p>
<ul class="simple">
<li><p><a class="reference external" href="https://spark.apache.org/docs/latest/ml-guide.html">Machine Learning Library (MLlib) Programming Guide</a></p></li>
<li><p><a class="reference internal" href="reference/pyspark.ml.html"><span class="std std-ref">Machine Learning (MLlib) API Reference</span></a></p></li>
</ul>
<p><strong>Spark Core and RDDs</strong></p>
<p>Spark Core is the underlying general execution engine for the Spark platform that all
other functionality is built on top of. It provides RDDs (Resilient Distributed Datasets)
and in-memory computing capabilities.</p>
<p>Note that the RDD API is a low-level API that can be difficult to use and does not give you
the benefit of Spark’s automatic query optimization capabilities.
We recommend using DataFrames (see <a class="reference internal" href="#index-page-spark-sql-and-dataframes"><span class="std std-ref">Spark SQL and DataFrames</span></a> above)
instead of RDDs, as they allow you to express what you want more easily and let Spark automatically
construct the most efficient query for you.</p>
<ul class="simple">
<li><p><a class="reference internal" href="reference/pyspark.html"><span class="std std-ref">Spark Core API Reference</span></a></p></li>
</ul>
<p><strong>Spark Streaming (Legacy)</strong></p>
<p>Spark Streaming is an extension of the core Spark API that enables scalable,
high-throughput, fault-tolerant stream processing of live data streams.</p>
<p>Note that Spark Streaming is the previous generation of Spark’s streaming engine.
It is a legacy project and it is no longer being updated.
There is a newer and easier to use streaming engine in Spark called
<a class="reference internal" href="#index-page-structured-streaming"><span class="std std-ref">Structured Streaming</span></a> which you
should use for your streaming applications and pipelines.</p>
<ul class="simple">
<li><p><a class="reference external" href="https://spark.apache.org/docs/latest/streaming-programming-guide.html">Spark Streaming Programming Guide (Legacy)</a></p></li>
<li><p><a class="reference internal" href="reference/pyspark.streaming.html"><span class="std std-ref">Spark Streaming API Reference (Legacy)</span></a></p></li>
</ul>
<div class="toctree-wrapper compound">
</div>
</section>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
<a class="right-next"
href="getting_started/index.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Getting Started</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div></div>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="tocsection sourcelink">
<a href="_sources/index.rst.txt">
<i class="fa-solid fa-file-lines"></i> Show Source
</a>
</div>
</div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>