<section id="contributing-to-pyspark">
<h1>Contributing to PySpark<a class="headerlink" href="#contributing-to-pyspark" title="Permalink to this headline">#</a></h1>
<p>There are many types of contribution, for example, helping other users, testing releases, reviewing changes,
documentation contribution, bug reporting, JIRA maintenance, code changes, etc.
These are documented at <a class="reference external" href="https://spark.apache.org/contributing.html">the general guidelines</a>.
This page focuses on PySpark and includes additional details specifically for PySpark.</p>
<section id="contributing-by-testing-releases">
<h2>Contributing by Testing Releases<a class="headerlink" href="#contributing-by-testing-releases" title="Permalink to this headline">#</a></h2>
<p>Before the official release, PySpark release candidates are shared in the <a class="reference external" href="https://mail-archives.apache.org/mod_mbox/spark-dev/">dev&#64;spark.apache.org</a> mailing list to vote on.
This release candidates can be easily installed via pip. For example, in case of Spark 3.0.0 RC1, you can install as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>https://dist.apache.org/repos/dist/dev/spark/v3.0.0-rc1-bin/pyspark-3.0.0.tar.gz
</pre></div>
</div>
<p>The link for release files such as <code class="docutils literal notranslate"><span class="pre">https://dist.apache.org/repos/dist/dev/spark/v3.0.0-rc1-bin</span></code> can be found in the vote thread.</p>
<p>Testing and verifying users’ existing workloads against release candidates is one of the vital contributions to PySpark.
It prevents breaking users’ existing workloads before the official release.
When there is an issue such as a regression, correctness problem or performance degradation worth enough to drop the release candidate,
usually the release candidate is dropped and the community focuses on fixing it to include in the next release candidate.</p>
</section>
<section id="contributing-documentation-changes">
<h2>Contributing Documentation Changes<a class="headerlink" href="#contributing-documentation-changes" title="Permalink to this headline">#</a></h2>
<p>The release documentation is located under Spark’s <a class="reference external" href="https://github.com/apache/spark/tree/master/docs">docs</a> directory.
<a class="reference external" href="https://github.com/apache/spark/blob/master/docs/README.md">README.md</a> describes the required dependencies and steps
to generate the documentations. Usually, PySpark documentation is tested with the command below
under the <a class="reference external" href="https://github.com/apache/spark/tree/master/docs">docs</a> directory:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">SKIP_SCALADOC</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">SKIP_RDOC</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">SKIP_SQLDOC</span><span class="o">=</span><span class="m">1</span><span class="w"> </span>bundle<span class="w"> </span><span class="nb">exec</span><span class="w"> </span>jekyll<span class="w"> </span>serve<span class="w"> </span>--watch
</pre></div>
</div>
<p>PySpark uses Sphinx to generate its release PySpark documentation. Therefore, if you want to build only PySpark documentation alone,
you can build under <a class="reference external" href="https://github.com/apache/spark/tree/master/python">python/docs</a> directory by:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>make<span class="w"> </span>html
</pre></div>
</div>
<p>It generates the corresponding HTMLs under <code class="docutils literal notranslate"><span class="pre">python/docs/build/html</span></code>.</p>
<p>Lastly, please make sure that the new APIs are documented by manually adding methods and/or classes at the corresponding RST files
under <code class="docutils literal notranslate"><span class="pre">python/docs/source/reference</span></code>. Otherwise, they would not be documented in PySpark documentation.</p>
</section>
<section id="preparing-to-contribute-code-changes">
<h2>Preparing to Contribute Code Changes<a class="headerlink" href="#preparing-to-contribute-code-changes" title="Permalink to this headline">#</a></h2>
<p>Before starting to work on codes in PySpark, it is recommended to read <a class="reference external" href="https://spark.apache.org/contributing.html">the general guidelines</a>.
Additionally, there are a couple of additional notes to keep in mind when contributing to codes in PySpark:</p>
<ul class="simple">
<li><dl class="simple">
<dt>Be Pythonic</dt><dd><p>See <a class="reference external" href="https://www.python.org/dev/peps/pep-0020/">The Zen of Python</a>.</p>
</dd>
</dl>
</li>
<li><dl class="simple">
<dt>Match APIs with Scala and Java sides</dt><dd><p>Apache Spark is an unified engine that provides a consistent API layer. In general, the APIs are consistently supported across other languages.</p>
</dd>
</dl>
</li>
<li><dl class="simple">
<dt>PySpark-specific APIs can be accepted</dt><dd><p>As long as they are Pythonic and do not conflict with other existent APIs, it is fine to raise a API request, for example, decorator usage of UDFs.</p>
</dd>
</dl>
</li>
<li><dl class="simple">
<dt>Adjust the corresponding type hints if you extend or modify public API</dt><dd><p>See <a class="reference internal" href="#contributing-and-maintaining-type-hints">Contributing and Maintaining Type Hints</a> for details.</p>
</dd>
</dl>
</li>
</ul>
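As an illustration of such a PySpark-specific, Pythonic API, below is a minimal sketch of the decorator usage of UDFs supported by `pyspark.sql.functions.udf` (the function and sample data are made up for illustration):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = SparkSession.builder.getOrCreate()

# udf() doubles as a decorator: to_upper becomes a UDF usable directly
# in DataFrame expressions, which reads more naturally in Python than
# wrapping a plain function after the fact.
@udf("string")
def to_upper(s):
    return s.upper() if s is not None else None

df = spark.createDataFrame([("spark",), (None,)], ["word"])
df.select(to_upper("word")).show()
```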
If you are fixing the pandas API on Spark (`pyspark.pandas`) package, please consider the design principles below:

- **Return a pandas-on-Spark data structure for big data, and a pandas data structure for small data.** Developers often face the question of whether a particular function should return a pandas-on-Spark DataFrame/Series or a pandas DataFrame/Series. The principle is: if the returned object can be large, use a pandas-on-Spark DataFrame/Series; if the data is bound to be small, use a pandas DataFrame/Series. For example, `DataFrame.dtypes` returns a pandas Series, because the number of columns in a DataFrame is bounded and small, whereas `DataFrame.head()` or `Series.unique()` returns a pandas-on-Spark DataFrame/Series, because the resulting object can be large (see the sketch after this list).

- **Provide discoverable APIs for common data science tasks.** At the risk of overgeneralization, there are two API design approaches: the first focuses on providing APIs for common tasks; the second starts with abstractions and lets users accomplish their tasks by composing primitives. While the world is not black and white, pandas takes more of the former approach, while Spark has taken more of the latter.

  One example is value count (count by some key column), one of the most common operations in data science. pandas' `DataFrame.value_counts()` returns the result in sorted order, which in 90% of cases is what users prefer when exploring data, whereas Spark's does not sort, which is more desirable when building data pipelines, since users can reproduce the pandas behavior by adding an explicit `orderBy`.

  Similar to pandas, the pandas API on Spark should also lean towards the former, providing discoverable APIs for common data science tasks. In most cases this principle is taken care of simply by implementing pandas' APIs. However, there will be circumstances in which pandas' APIs don't address a specific need, e.g. plotting for big data.

- **Provide guardrails to prevent users from shooting themselves in the foot.** Certain operations in pandas are prohibitively expensive as data scales, and we don't want to give users the illusion that they can rely on such operations in the pandas API on Spark. That is to say, methods implemented in the pandas API on Spark should be safe to perform by default on large datasets. As a result, the following capabilities are not implemented:

  - Capabilities that are fundamentally not parallelizable, e.g. imperatively looping over each element.
  - Capabilities that require materializing the entire working set in a single node's memory. This is why [pandas.DataFrame.to_xarray](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_xarray.html) is not implemented. Another example is the `_repr_html_` call, which caps the total number of records shown to a maximum of 1000, to prevent users from blowing up their driver node simply by typing the name of the DataFrame in a notebook.

  A few exceptions, however, exist. One common pattern in "big data science" is that while the initial dataset is large, the working set becomes smaller as the analysis goes deeper. For example, data scientists often perform aggregation on datasets and then want to convert the aggregated dataset to some local data structure. To help them, we offer the following:

  - `DataFrame.to_pandas`: returns a pandas DataFrame (pandas-on-Spark only).
  - `DataFrame.to_numpy`: returns a NumPy array; works with both pandas and the pandas API on Spark.

  Note that these names make it clear that the functions return a local data structure that requires materializing data in a single node's memory. For these functions, we also explicitly document, with a warning note, that the resulting data structure must be small.
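To make the first principle concrete, here is a small sketch of the return-type rule in action (the printed types follow from the rule as described above):

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})

# Bounded, small result (one entry per column): a local pandas Series.
print(type(psdf.dtypes))   # <class 'pandas.core.series.Series'>

# Potentially large result: stays as a pandas-on-Spark DataFrame.
print(type(psdf.head(2)))  # <class 'pyspark.pandas.frame.DataFrame'>

# Explicitly named local conversion for a small, aggregated working set;
# the name to_pandas signals that data is materialized on the driver.
local_pdf = psdf.groupby("a").sum().to_pandas()
```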
## Environment Setup

### Prerequisite

PySpark development requires building Spark, which in turn requires a proper JDK installation, among other things. See [Building Spark](https://spark.apache.org/docs/latest/building-spark.html) for more details.

Note that if you intend to contribute to Spark Connect in Python, `buf` is required; see [Buf Installation](https://docs.buf.build/installation) for more details.
<section id="conda">
<h3>Conda<a class="headerlink" href="#conda" title="Permalink to this headline">#</a></h3>
<p>If you are using Conda, the development environment can be set as follows.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Python 3.9+ is required</span>
conda<span class="w"> </span>create<span class="w"> </span>--name<span class="w"> </span>pyspark-dev-env<span class="w"> </span><span class="nv">python</span><span class="o">=</span><span class="m">3</span>.9
conda<span class="w"> </span>activate<span class="w"> </span>pyspark-dev-env
pip<span class="w"> </span>install<span class="w"> </span>--upgrade<span class="w"> </span>-r<span class="w"> </span>dev/requirements.txt
</pre></div>
</div>
<p>Once it is set up, make sure you switch to <cite>pyspark-dev-env</cite> before starting the development:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>activate<span class="w"> </span>pyspark-dev-env
</pre></div>
</div>
<p>Now, you can start developing and <a class="reference internal" href="testing.html"><span class="doc">running the tests</span></a>.</p>
</section>
<section id="pip">
<h3>pip<a class="headerlink" href="#pip" title="Permalink to this headline">#</a></h3>
<p>With Python 3.9+, pip can be used as below to install and set up the development environment.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>--upgrade<span class="w"> </span>-r<span class="w"> </span>dev/requirements.txt
</pre></div>
</div>
<p>Now, you can start developing and <a class="reference internal" href="testing.html"><span class="doc">running the tests</span></a>.</p>
</section>
</section>
<section id="contributing-and-maintaining-type-hints">
<h2>Contributing and Maintaining Type Hints<a class="headerlink" href="#contributing-and-maintaining-type-hints" title="Permalink to this headline">#</a></h2>
<p>PySpark type hints are inlined, to take advantage of static type checking.</p>
<p>As a rule of thumb, only public API is annotated.</p>
<p>Annotations should, when possible:</p>
<ul>
<li><p>Reflect expectations of the underlying JVM API, to help avoid type related failures outside Python interpreter.</p></li>
<li><p>In case of conflict between too broad (<code class="docutils literal notranslate"><span class="pre">Any</span></code>) and too narrow argument annotations, prefer the latter as one, as long as it is covering most of the typical use cases.</p></li>
<li><p>Indicate nonsensical combinations of arguments using <code class="docutils literal notranslate"><span class="pre">&#64;overload</span></code> annotations. For example, to indicate that <code class="docutils literal notranslate"><span class="pre">*Col</span></code> and <code class="docutils literal notranslate"><span class="pre">*Cols</span></code> arguments are mutually exclusive:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> <span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> <span class="o">...</span>
</pre></div>
</div>
</li>
- Be compatible with the current stable MyPy release.
Complex supporting type definitions should be placed in dedicated `_typing.pyi` stubs. See for example [pyspark.sql._typing.pyi](https://github.com/apache/spark/blob/master/python/pyspark/sql/_typing.pyi).
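For a flavor of what such a stub holds, here is a minimal illustrative sketch; the alias below is representative of the kind of definition found there, not a verbatim excerpt:

```python
# _typing.pyi (illustrative sketch): shared aliases that annotations
# across the package can import, keeping individual signatures short.
from typing import Union

from pyspark.sql.column import Column

# Accepted wherever an API takes "a column or the name of a column".
ColumnOrName = Union[Column, str]
```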
Annotations can be validated using the `dev/lint-python` script or by invoking mypy directly:

```bash
mypy --config python/mypy.ini python/pyspark
```
<section id="code-and-docstring-guide">
<h2>Code and Docstring Guide<a class="headerlink" href="#code-and-docstring-guide" title="Permalink to this headline">#</a></h2>
<section id="code-conventions">
<h3>Code Conventions<a class="headerlink" href="#code-conventions" title="Permalink to this headline">#</a></h3>
<p>Please follow the style of the existing codebase as is, which is virtually PEP 8 with one exception: lines can be up
to 100 characters in length, not 79.</p>
Note that:

- Method and variable names in PySpark follow `camelCase`, similar to the `threading` library in Python itself, whose APIs were inspired by Java. PySpark follows `camelCase` for exposed APIs so that they match Scala and Java (see the sketch after this list).
- In contrast, `functions.py` uses `snake_case` in order to make its APIs SQL- (and Python-) friendly.
- In addition, pandas-on-Spark (`pyspark.pandas`) also uses `snake_case`, because this package does not need to keep API consistency with other languages.
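A minimal sketch contrasting the three conventions (assuming a running `SparkSession` named `spark`):

```python
from pyspark.sql import functions as sf

df = spark.range(3)

# The DataFrame API mirrors Scala/Java, hence camelCase:
renamed = df.withColumnRenamed("id", "userId")

# functions.py is snake_case, to stay SQL- and Python-friendly:
dated = renamed.select(sf.current_date().alias("today"))

# pandas-on-Spark is snake_case as well, matching pandas itself:
pdf = df.pandas_api().to_pandas()
```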
PySpark leverages linters such as [pycodestyle](https://pycodestyle.pycqa.org/en/latest/) and [flake8](https://flake8.pycqa.org/en/latest/), which `dev/lint-python` runs. Therefore, make sure to run that script to double-check your changes.
<section id="docstring-conventions">
<h3>Docstring Conventions<a class="headerlink" href="#docstring-conventions" title="Permalink to this headline">#</a></h3>
<p>PySpark follows <a class="reference external" href="https://numpydoc.readthedocs.io/en/latest/format.html">NumPy documentation style</a>.</p>
</section>
<section id="doctest-conventions">
<h3>Doctest Conventions<a class="headerlink" href="#doctest-conventions" title="Permalink to this headline">#</a></h3>
<p>In general, doctests should be grouped logically by separating a newline.</p>
<p>For instance, the first block is for the statements for preparation, the second block is for using the function with a specific argument,
and third block is for another argument. As a example, please refer <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rsub.html#pandas.DataFrame.rsub">DataFrame.rsub</a> in pandas.</p>
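A hypothetical docstring sketching this grouping (plain Python here, so the expected outputs are exact):

```python
def add_n(values, n):
    """Return each element of ``values`` increased by ``n``.

    Examples
    --------
    The first block prepares the data.

    >>> data = [1, 2, 3]

    The second block exercises one argument.

    >>> add_n(data, 1)
    [2, 3, 4]

    The third block covers another argument.

    >>> add_n(data, 10)
    [11, 12, 13]
    """
    return [v + n for v in values]
```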
These blocks should be consistently separated in PySpark doctests, and more doctests should be added whenever the coverage of the doctests or the number of examples shown is not enough.
<section id="contributing-error-and-exception">
<h2>Contributing Error and Exception<a class="headerlink" href="#contributing-error-and-exception" title="Permalink to this headline">#</a></h2>
<p>To throw a standardized user-facing error or exception, developers should specify the error class and message parameters rather than an arbitrary error message.</p>
<section id="usage">
<h3>Usage<a class="headerlink" href="#usage" title="Permalink to this headline">#</a></h3>
<ol class="arabic simple">
<li><p>Check if an appropriate error class already exists in <a class="reference internal" href="errors.html#error-classes-in-pyspark"><span class="std std-ref">Error classes in PySpark</span></a>.
If true, use the error class and skip to step 3.</p></li>
<li><p>Add a new class to <a class="reference external" href="https://github.com/apache/spark/blob/master/python/pyspark/errors/error-conditions.json">error-conditions.json</a>; keep in mind the invariants below.</p></li>
<li><p>Check if the exception type already extends <cite>PySparkException</cite>.
If true, skip to step 5.</p></li>
<li><p>Mix <cite>PySparkException</cite> into the exception.</p></li>
<li><p>Throw the exception with the error class and message parameters.</p></li>
</ol>
**Before**

Throw with an arbitrary error message:

```python
raise ValueError("Problem A because B")
```

**After**

`error-conditions.json`

```json
"PROBLEM_BECAUSE": {
  "message": ["Problem <problem> because <cause>"]
}
```
`exceptions.py`

```python
class PySparkTestError(PySparkException):
    def __init__(self, error_class: str, message_parameters: Dict[str, str]):
        super().__init__(error_class=error_class, message_parameters=message_parameters)

    def getMessageParameters(self) -> Optional[Dict[str, str]]:
        return super().getMessageParameters()
```
Throw with the error class and message parameters:

```python
raise PySparkTestError("PROBLEM_BECAUSE", {"problem": "A", "cause": "B"})
```
<section id="access-fields">
<h3>Access fields<a class="headerlink" href="#access-fields" title="Permalink to this headline">#</a></h3>
<p>To access error fields, catch exceptions that extend <a class="reference internal" href="../reference/api/pyspark.errors.PySparkException.html#pyspark.errors.PySparkException" title="pyspark.errors.PySparkException"><code class="xref py py-class docutils literal notranslate"><span class="pre">PySparkException</span></code></a> and access to error class with <a class="reference internal" href="../reference/api/pyspark.errors.PySparkException.getErrorClass.html#pyspark.errors.PySparkException.getErrorClass" title="pyspark.errors.PySparkException.getErrorClass"><code class="xref py py-func docutils literal notranslate"><span class="pre">PySparkException.getErrorClass()</span></code></a>.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">try</span><span class="p">:</span>
<span class="o">...</span>
<span class="k">except</span> <span class="n">PySparkException</span> <span class="k">as</span> <span class="n">pe</span><span class="p">:</span>
<span class="k">if</span> <span class="n">pe</span><span class="o">.</span><span class="n">getErrorClass</span><span class="p">()</span> <span class="o">==</span> <span class="s2">&quot;PROBLEM_BECAUSE&quot;</span><span class="p">:</span>
<span class="o">...</span>
</pre></div>
</div>
</section>
<section id="fields">
<h3>Fields<a class="headerlink" href="#fields" title="Permalink to this headline">#</a></h3>
<p><strong>Error class</strong></p>
<p>Error classes are a succinct, human-readable representation of the error category.</p>
<p>An uncategorized errors can be assigned to a legacy error class with the prefix <cite>_LEGACY_ERROR_TEMP_</cite> and an unused sequential number, for instance <cite>_LEGACY_ERROR_TEMP_0053</cite>.</p>
<p>Invariants:</p>
<ul class="simple">
<li><p>Unique</p></li>
<li><p>Consistent across releases</p></li>
<li><p>Sorted alphabetically</p></li>
</ul>
<p><strong>Message</strong></p>
<p>Error messages provide a descriptive, human-readable representation of the error.
The message format accepts string parameters via the C-style printf syntax.</p>
<p>The quality of the error message should match the <a class="reference external" href="https://spark.apache.org/error-message-guidelines.html">Apache Spark Error Message Guidelines</a></p>
<p>Invariants:</p>
<ul class="simple">
<li><p>Unique</p></li>
</ul>