<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>
Contributing to Spark | Apache Spark
</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
<!-- Code highlighter CSS -->
<link href="/css/pygments-default.css" rel="stylesheet">
<link rel="icon" href="/favicon.ico" type="image/x-icon">
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4" style="background: #1D6890;">
<a class="navbar-brand" href="/">
<img src="/images/spark-logo-rev.svg" alt="" width="141" height="72">
</a>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarContent"
aria-controls="navbarContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse col-md-12 col-lg-auto pt-4" id="navbarContent">
<ul class="navbar-nav me-auto">
<li class="nav-item">
<a class="nav-link active" aria-current="page" href="/downloads.html">Download</a>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="libraries" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Libraries
</a>
<ul class="dropdown-menu" aria-labelledby="libraries">
<li><a class="dropdown-item" href="/sql/">SQL and DataFrames</a></li>
<li><a class="dropdown-item" href="/spark-connect/">Spark Connect</a></li>
<li><a class="dropdown-item" href="/streaming/">Spark Streaming</a></li>
<li><a class="dropdown-item" href="/pandas-on-spark/">pandas on Spark</a></li>
<li><a class="dropdown-item" href="/mllib/">MLlib (machine learning)</a></li>
<li><a class="dropdown-item" href="/graphx/">GraphX (graph)</a></li>
<li>
<hr class="dropdown-divider">
</li>
<li><a class="dropdown-item" href="/third-party-projects.html">Third-Party Projects</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="documentation" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Documentation
</a>
<ul class="dropdown-menu" aria-labelledby="documentation">
<li><a class="dropdown-item" href="/docs/latest/">Latest Release</a></li>
<li><a class="dropdown-item" href="/documentation.html">Older Versions and Other Resources</a></li>
<li><a class="dropdown-item" href="/faq.html">Frequently Asked Questions</a></li>
</ul>
</li>
<li class="nav-item">
<a class="nav-link active" aria-current="page" href="/examples.html">Examples</a>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="community" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Community
</a>
<ul class="dropdown-menu" aria-labelledby="community">
<li><a class="dropdown-item" href="/community.html">Mailing Lists &amp; Resources</a></li>
<li><a class="dropdown-item" href="/contributing.html">Contributing to Spark</a></li>
<li><a class="dropdown-item" href="/improvement-proposals.html">Improvement Proposals (SPIP)</a>
</li>
<li><a class="dropdown-item" href="https://issues.apache.org/jira/browse/SPARK">Issue Tracker</a>
</li>
<li><a class="dropdown-item" href="/powered-by.html">Powered By</a></li>
<li><a class="dropdown-item" href="/committers.html">Project Committers</a></li>
<li><a class="dropdown-item" href="/history.html">Project History</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="developers" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Developers
</a>
<ul class="dropdown-menu" aria-labelledby="developers">
<li><a class="dropdown-item" href="/developer-tools.html">Useful Developer Tools</a></li>
<li><a class="dropdown-item" href="/versioning-policy.html">Versioning Policy</a></li>
<li><a class="dropdown-item" href="/release-process.html">Release Process</a></li>
<li><a class="dropdown-item" href="/security.html">Security</a></li>
</ul>
</li>
</ul>
<ul class="navbar-nav ml-auto">
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="apacheFoundation" role="button"
data-bs-toggle="dropdown" aria-expanded="false">
Apache Software Foundation
</a>
<ul class="dropdown-menu" aria-labelledby="apacheFoundation">
<li><a class="dropdown-item" href="https://www.apache.org/">Apache Homepage</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/licenses/">License</a></li>
<li><a class="dropdown-item"
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/foundation/thanks.html">Thanks</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/security/">Security</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/events/current-event">Event</a></li>
</ul>
</li>
</ul>
</div>
</nav>
<div class="container">
<div class="row mt-4">
<div class="col-12 col-md-9">
<p>This guide documents the best way to make various types of contribution to Apache Spark,
including what is required before submitting a code change.</p>
<p>Contributing to Spark doesn&#8217;t just mean writing code. Helping new users on the mailing list,
testing releases, and improving documentation are also welcome. In fact, proposing significant
code changes usually requires first gaining experience and credibility within the community by
helping in other ways. This is also a guide to becoming an effective contributor.</p>
<p>Accordingly, this guide presents contributions roughly in the order in which new contributors
who intend to get involved long-term should consider them. Build a track record of helping others,
rather than just opening pull requests.</p>
<h2>Contributing by helping other users</h2>
<p>A great way to contribute to Spark is to help answer user questions on the <code class="language-plaintext highlighter-rouge">user@spark.apache.org</code>
mailing list or on StackOverflow. There are always many new Spark users; taking a few minutes to
help answer a question is a very valuable community service.</p>
<p>Contributors should subscribe to this list and follow it in order to keep up to date on what&#8217;s
happening in Spark. Answering questions is an excellent and visible way to help the community,
which also demonstrates your expertise.</p>
<p>See the <a href="/mailing-lists.html">Mailing Lists guide</a> for guidelines
about how to effectively participate in discussions on the mailing list, as well as forums
like StackOverflow.</p>
<h2>Contributing by testing releases</h2>
<p>Spark&#8217;s release process is community-oriented, and members of the community can vote on new
releases on the <code class="language-plaintext highlighter-rouge">dev@spark.apache.org</code> mailing list. Spark users are invited to subscribe to
this list to receive announcements, test their workloads on newer releases, and provide
feedback on any performance or correctness issues they find.</p>
<h2>Contributing by reviewing changes</h2>
<p>Changes to Spark source code are proposed, reviewed and committed via
<a href="https://github.com/apache/spark/pulls">GitHub pull requests</a> (described later).
Anyone can view and comment on active changes here.
Reviewing others&#8217; changes is a good way to learn how the change process works and gain exposure
to activity in various parts of the code. You can help by reviewing the changes and asking
questions or pointing out issues &#8211; as simple as typos or small issues of style.
See also <a href="https://spark-prs.appspot.com/">https://spark-prs.appspot.com/</a> for a
convenient way to view and filter open PRs.</p>
<h2>Contributing documentation changes</h2>
<p>To propose a change to <em>release</em> documentation (that is, docs that appear under
<a href="https://spark.apache.org/docs/">https://spark.apache.org/docs/</a>),
edit the Markdown source files in Spark&#8217;s
<a href="https://github.com/apache/spark/tree/master/docs"><code class="language-plaintext highlighter-rouge">docs/</code></a> directory,
whose <code class="language-plaintext highlighter-rouge">README</code> file shows how to build the documentation locally to test your changes.
The process to propose a doc change is otherwise the same as the process for proposing code
changes below.</p>
<p>To propose a change to the rest of the documentation (that is, docs that do <em>not</em> appear under
<a href="https://spark.apache.org/docs/">https://spark.apache.org/docs/</a>), similarly, edit the Markdown in the
<a href="https://github.com/apache/spark-website">spark-website repository</a> and open a pull request.</p>
<h2>Contributing user libraries to Spark</h2>
<p>Just as Java and Scala applications can access a huge selection of libraries and utilities,
none of which are part of Java or Scala themselves, Spark aims to support a rich ecosystem of
libraries. Many new useful utilities or features belong outside of Spark rather than in the core.
For example: language support probably has to be a part of core Spark, but useful machine
learning algorithms can happily exist outside of MLlib.</p>
<p>To that end, large and independent new functionality is often rejected for inclusion in Spark
itself, but can and should be hosted as a separate project and repository, and included in
the <a href="https://spark-packages.org/">spark-packages.org</a> collection.</p>
<h2>Contributing bug reports</h2>
<p>Ideally, bug reports are accompanied by a proposed code change to fix the bug. This isn&#8217;t
always possible, as those who discover a bug may not have the experience to fix it. A bug
may be reported by creating a JIRA but without creating a pull request (see below).</p>
<p>Bug reports are only useful, however, if they include enough information to understand, isolate,
and ideally reproduce the bug. Simply encountering an error does not mean a bug should be
reported; as below, search JIRA and search and inquire on the Spark user / dev mailing lists
first. Unreproducible bugs, or simple error reports, may be closed.</p>
<p>It&#8217;s very helpful if the bug report describes how the bug was introduced, and by
which commit, so that reviewers can easily understand the bug. It also helps committers to
decide how far the bug fix should be backported, when the pull request is merged. The pull
request to fix the bug should narrow down the problem to the root cause.</p>
<p>A performance regression is also a kind of bug. The pull request to fix a performance regression
must provide a benchmark to prove the problem is indeed fixed.</p>
<p>Note that data correctness/data loss bugs are very serious. Make sure the corresponding bug
report JIRA ticket is labeled as <code class="language-plaintext highlighter-rouge">correctness</code> or <code class="language-plaintext highlighter-rouge">data-loss</code>. If the bug report doesn&#8217;t get
enough attention, please send an email to <code class="language-plaintext highlighter-rouge">dev@spark.apache.org</code> to draw more attention.</p>
<p>It is possible to propose new features as well. These are generally not helpful unless
accompanied by detail, such as a design document and/or code change. Large new contributions
should consider <a href="https://spark-packages.org/">spark-packages.org</a> first (see above),
or be discussed on the mailing
list. Feature requests may be rejected, or closed after a long period of inactivity.</p>
<h2>Contributing to JIRA maintenance</h2>
<p>Given the sheer volume of issues raised in the Apache Spark JIRA, inevitably some issues are
duplicates, become obsolete, are eventually fixed by other means, can&#8217;t be reproduced, or could
benefit from more detail, and so on. It&#8217;s useful to help identify these issues and resolve them,
either by advancing the discussion or even resolving the JIRA. Most contributors are able to
directly resolve JIRAs. Use judgment in determining whether you are quite confident the issue
should be resolved, although changes can be easily undone. If in doubt, just leave a comment
on the JIRA.</p>
<p>When resolving JIRAs, observe a few useful conventions:</p>
<ul>
<li>Resolve as <strong>Fixed</strong> if there&#8217;s a change you can point to that resolved the issue
<ul>
<li>Set Fix Version(s), if and only if the resolution is Fixed</li>
<li>Set Assignee to the person who most contributed to the resolution, which is usually the person
who opened the PR that resolved the issue.</li>
<li>In case several people contributed, prefer to assign to the more &#8216;junior&#8217;, non-committer contributor</li>
</ul>
</li>
<li>For issues that can&#8217;t be reproduced against master as reported, resolve as <strong>Cannot Reproduce</strong>
<ul>
<li>Fixed is reasonable too, if it&#8217;s clear what other previous pull request resolved it. Link to it.</li>
</ul>
</li>
<li>If the issue is the same as or a subset of another issue, resolve as <strong>Duplicate</strong>
<ul>
<li>Make sure to link to the JIRA it duplicates</li>
<li>Prefer to resolve the issue that has less activity or discussion as the duplicate</li>
</ul>
</li>
<li>If the issue seems clearly obsolete and applies to issues or components that have changed
radically since it was opened, resolve as <strong>Not a Problem</strong></li>
<li>If the issue doesn&#8217;t make sense (for example, it is not actionable, or it is a non-Spark issue), resolve
as <strong>Invalid</strong></li>
<li>If it&#8217;s a coherent issue, but there is a clear indication that there is no support or interest
in acting on it, then resolve as <strong>Won&#8217;t Fix</strong></li>
<li>Umbrellas are frequently marked <strong>Done</strong> if they are just container issues that don&#8217;t correspond
to an actionable change of their own</li>
</ul>
<h2>Preparing to contribute code changes</h2>
<h3>Choosing what to contribute</h3>
<p>Spark is an exceptionally busy project, with a new JIRA or pull request every few hours on average.
Review can take hours or days of committer time. Everyone benefits if contributors focus on
changes that are useful, clear, easy to evaluate, and already pass basic checks.</p>
<p>Sometimes, a contributor will already have a particular new change or bug in mind. If seeking
ideas, consult the list of starter tasks in JIRA, or ask the <code class="language-plaintext highlighter-rouge">user@spark.apache.org</code> mailing list.</p>
<p>Before proceeding, contributors should evaluate if the proposed change is likely to be relevant,
new and actionable:</p>
<ul>
<li>Is it clear that code must change? Proposing a JIRA and pull request is appropriate only when a
clear problem or change has been identified. If you are simply having trouble using Spark, use the mailing
lists first, rather than filing a JIRA or proposing a change. When in doubt, email
<code class="language-plaintext highlighter-rouge">user@spark.apache.org</code> first about the possible change</li>
<li>Search the <code class="language-plaintext highlighter-rouge">user@spark.apache.org</code> and <code class="language-plaintext highlighter-rouge">dev@spark.apache.org</code> mailing list
<a href="/community.html#mailing-lists">archives</a> for
related discussions.
Often, the problem has been discussed before, either with a resolution that doesn&#8217;t require a code
change, or with a record of what kinds of changes will not be accepted as a resolution.</li>
<li>Search JIRA for existing issues:
<a href="https://issues.apache.org/jira/browse/SPARK">https://issues.apache.org/jira/browse/SPARK</a></li>
<li>Type <code class="language-plaintext highlighter-rouge">spark [search terms]</code> in the top-right search box. If a logically similar issue already
exists, then contribute to the discussion on the existing JIRA and pull request first, instead of
creating a new one.</li>
<li>Is the scope of the change matched to the contributor&#8217;s level of experience? Anyone is qualified
to suggest a typo fix, but refactoring core scheduling logic requires much more understanding of
Spark. Some changes require building up experience first (see above).</li>
</ul>
<p>It&#8217;s worth reemphasizing that changes to the core of Spark, or to highly complex and important modules
like SQL and Catalyst, are more difficult to make correctly. They will be subjected to more scrutiny,
and held to a higher standard of review than changes to less critical code.</p>
<h3>MLlib-specific contribution guidelines</h3>
<p>While a rich set of algorithms is an important goal for MLlib, scaling the project requires
that maintainability, consistency, and code quality come first. New algorithms should:</p>
<ul>
<li>Be widely known</li>
<li>Be used and accepted (academic citations and concrete use cases can help justify this)</li>
<li>Be highly scalable</li>
<li>Be well documented</li>
<li>Have APIs consistent with other algorithms in MLlib that accomplish the same thing</li>
<li>Come with a reasonable expectation of developer support</li>
<li>Have <code class="language-plaintext highlighter-rouge">@Since</code> annotations on public classes, methods, and variables</li>
</ul>
<h3>Error message guidelines</h3>
<p>Exceptions thrown in Spark should be associated with standardized and actionable
error messages.</p>
<p>Error messages should answer the following questions:</p>
<ul>
<li><strong>What</strong> was the problem?</li>
<li><strong>Why</strong> did the problem happen?</li>
<li><strong>How</strong> can the problem be solved?</li>
</ul>
<p>When writing error messages, you should:</p>
<ul>
<li>Use active voice</li>
<li>Avoid time-based statements, such as promises of future support</li>
<li>Use the present tense to describe the error and provide suggestions</li>
<li>Provide concrete examples if the resolution is unclear</li>
<li>Avoid sounding accusatory, judgmental, or insulting</li>
<li>Be direct</li>
<li>Do not use programming jargon in user-facing errors</li>
</ul>
<p>See the <a href="/error-message-guidelines.html">error message guidelines</a> for more details.</p>
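<p>As an illustrative sketch only (not an official Spark error message), compare a message that ignores
these guidelines with one that answers the what/why/how questions above; the values shown are invented
for the example:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>Unhelpful:  Operation failed: NullPointerException in stage 3.

Better:     Cannot parse the value '2021-13-01' in column 'event_date' as a date
            because the month is out of range. Use the format 'yyyy-MM-dd' with a
            month between 01 and 12, or cast the column to STRING.
</code></pre></div></div>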
<h3>Code review criteria</h3>
<p>Before considering how to contribute code, it&#8217;s useful to understand how code is reviewed,
and why changes may be rejected. See the
<a href="https://google.github.io/eng-practices/review/">detailed guide for code reviewers</a>
from Google&#8217;s Engineering Practices documentation.
Simply put, changes that have many or large
positives, and few negative effects or risks, are much more likely to be merged, and merged quickly.
Risky and less valuable changes are very unlikely to be merged, and may be rejected outright
rather than receive iterations of review.</p>
<h4>Positives</h4>
<ul>
<li>Fixes the root cause of a bug in existing functionality</li>
<li>Adds functionality or fixes a problem needed by a large number of users</li>
<li>Simple, targeted</li>
<li>Maintains or improves consistency across Python, Java, Scala</li>
<li>Easily tested; has tests</li>
<li>Reduces complexity and lines of code</li>
<li>Change has already been discussed and is known to committers</li>
</ul>
<h4>Negatives, risks</h4>
<ul>
<li>Band-aids a symptom of a bug only</li>
<li>Introduces complex new functionality, especially an API that needs to be supported</li>
<li>Adds complexity that only helps a niche use case</li>
<li>Adds user-space functionality that does not need to be maintained in Spark, but could be hosted
externally and indexed by <a href="https://spark-packages.org/">spark-packages.org</a></li>
<li>Changes a public API or semantics (rarely allowed)</li>
<li>Adds large dependencies</li>
<li>Changes versions of existing dependencies</li>
<li>Adds a large amount of code</li>
<li>Makes lots of modifications in one &#8220;big bang&#8221; change</li>
</ul>
<h2>Contributing code changes</h2>
<p>Please review the preceding section before proposing a code change. This section documents how to do so.</p>
<p><strong>When you contribute code, you affirm that the contribution is your original work and that you
license the work to the project under the project&#8217;s open source license. Whether or not you state
this explicitly, by submitting any copyrighted material via pull request, email, or other means
you agree to license the material under the project&#8217;s open source license and warrant that you
have the legal authority to do so.</strong></p>
<h3>Cloning the Apache Spark<span class="tm">&trade;</span> source code</h3>
<p>If you are interested in working with the newest under-development code or contributing to Apache Spark development, you can check out the master branch from Git:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Master development branch
git clone https://github.com/apache/spark.git
</code></pre></div></div>
<p>Once you&#8217;ve downloaded Spark, you can find instructions for installing and building it on the <a href="/documentation.html">documentation page</a>.</p>
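<p>For example, a typical first local build with the bundled Maven looks like the following; this is a
sketch based on the standard build instructions, and the exact commands and flags for your version are
documented on the <a href="/documentation.html">documentation page</a>:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Compile Spark and its modules, skipping tests for a faster first build
./build/mvn -DskipTests clean package
</code></pre></div></div>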
<h3>JIRA</h3>
<p>Generally, Spark uses JIRA to track logical issues, including bugs and improvements, and uses
GitHub pull requests to manage the review and merge of specific code changes. That is, JIRAs are
used to describe <em>what</em> should be fixed or changed, and high-level approaches, and pull requests
describe <em>how</em> to implement that change in the project&#8217;s source code. For example, major design
decisions are discussed in JIRA.</p>
<ol>
<li>Find the existing Spark JIRA that the change pertains to.
<ol>
<li>Do not create a new JIRA if your change addresses an existing issue in JIRA; add to
the existing discussion and work instead</li>
<li>Look for existing pull requests that are linked from the JIRA, to understand if someone is
already working on the JIRA</li>
</ol>
</li>
<li>If the change is new, then it usually needs a new JIRA. However, trivial changes, where
what should change is virtually the same as how it should change, do not require a JIRA.
Example: <code class="language-plaintext highlighter-rouge">Fix typos in Foo scaladoc</code></li>
<li>If required, create a new JIRA:
<ol>
<li>Provide a descriptive Title. &#8220;Update web UI&#8221; or &#8220;Problem in scheduler&#8221; is not sufficient.
&#8220;Kafka Streaming support fails to handle empty queue in YARN cluster mode&#8221; is good.</li>
<li>Write a detailed Description. For bug reports, this should ideally include a short
reproduction of the problem. For new features, it may include a design document.</li>
<li>Set required fields:
<ol>
<li><strong>Issue Type</strong>. Generally, Bug, Improvement and New Feature are the only types used in Spark.</li>
<li><strong>Priority</strong>. Set to Major or below; higher priorities are generally reserved for
committers to set. The main exception is correctness or data-loss issues, which can be flagged as
Blockers. JIRA tends to unfortunately conflate &#8220;size&#8221; and &#8220;importance&#8221; in its
Priority field values. Their meaning is roughly:
<ol>
<li>Blocker: pointless to release without this change as the release would be unusable
to a large minority of users. Correctness and data loss issues should be considered Blockers for their target versions.</li>
<li>Critical: a large minority of users are missing important functionality without
this, and/or a workaround is difficult</li>
<li>Major: a small minority of users are missing important functionality without this,
and there is a workaround</li>
<li>Minor: a niche use case is missing some support, but it does not affect usage or
is easily worked around</li>
<li>Trivial: a nice-to-have change but unlikely to be any problem in practice otherwise</li>
</ol>
</li>
<li><strong>Component</strong></li>
<li><strong>Affects Version</strong>. For Bugs, assign at least one version that is known to exhibit the
problem or need the change</li>
<li><strong>Label</strong>. Not widely used, except for the following:
<ul>
<li><code class="language-plaintext highlighter-rouge">correctness</code>: a correctness issue</li>
<li><code class="language-plaintext highlighter-rouge">data-loss</code>: a data loss issue</li>
<li><code class="language-plaintext highlighter-rouge">release-notes</code>: the change&#8217;s effects need mention in release notes. The JIRA or pull request
should include detail suitable for inclusion in release notes &#8211; see &#8220;Docs Text&#8221; below.</li>
<li><code class="language-plaintext highlighter-rouge">starter</code>: small, simple change suitable for new contributors</li>
</ul>
</li>
<li><strong>Docs Text</strong>: For issues that require an entry in the release notes, this should contain the
information that the release manager should include in Release Notes. This should include a short summary
of what behavior is impacted, and detail on what behavior changed. It can be provisionally filled out
when the JIRA is opened, but will likely need to be updated with final details when the issue is
resolved.</li>
</ol>
</li>
<li>Do not set the following fields:
<ol>
<li><strong>Fix Version</strong>. This is assigned by committers only when resolved.</li>
<li><strong>Target Version</strong>. This is assigned by committers to indicate a PR has been accepted for
possible fix by the target version.</li>
</ol>
</li>
<li>Do not include a patch file; pull requests are used to propose the actual change.</li>
</ol>
</li>
<li>If the change is a large change, consider inviting discussion on the issue at
<code class="language-plaintext highlighter-rouge">dev@spark.apache.org</code> first before proceeding to implement the change.</li>
</ol>
<h3>Pull request</h3>
<p>Before creating a pull request in Apache Spark, it is important to check whether tests pass on your branch, because
our GitHub Actions workflows automatically run tests for your pull request and its subsequent commits,
and every run burdens the limited GitHub Actions resources of the Apache Spark repository.
The steps below will take you through the process.</p>
<ol>
<li><a href="https://help.github.com/articles/fork-a-repo/">Fork</a> the GitHub repository at
<a href="https://github.com/apache/spark">https://github.com/apache/spark</a> if you haven&#8217;t already</li>
<li>Go to &#8220;Actions&#8221; tab on your forked repository and enable &#8220;Build and test&#8221; and &#8220;Report test results&#8221; workflows</li>
<li>Clone your fork and create a new branch (see the workflow sketch after this list)</li>
<li>Consider whether documentation or tests need to be added or updated as part of the change,
and add them as needed.
<ol>
<li>When you add tests, make sure the tests are self-descriptive.</li>
<li>Also, consider including the JIRA ID in the tests when your pull request aims to fix
a specific issue. In practice, this is usually done when the JIRA type is a bug, or when a PR adds
a couple of tests to an existing test class. See the examples below:
<ul>
<li>Scala
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>test("SPARK-12345: a short description of the test") {
...
</code></pre></div> </div>
</li>
<li>Java
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>@Test
public void testCase() {
// SPARK-12345: a short description of the test
...
</code></pre></div> </div>
</li>
<li>Python
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>def test_case(self):
# SPARK-12345: a short description of the test
...
</code></pre></div> </div>
</li>
<li>R
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>test_that("SPARK-12345: a short description of the test", {
...
</code></pre></div> </div>
</li>
</ul>
</li>
</ol>
</li>
<li>Consider whether benchmark results should be added or updated as part of the change. If so, generate them by
<a href="/developer-tools.html#github-workflow-benchmarks">running benchmarks in your forked repository</a>
and include the results.</li>
<li>Run all tests with <code class="language-plaintext highlighter-rouge">./dev/run-tests</code> to verify that the code still compiles, passes tests, and
passes style checks.
If style checks fail, review the Code Style Guide below.</li>
<li>Push commits to your branch. This will trigger &#8220;Build and test&#8221; and &#8220;Report test results&#8221; workflows
on your forked repository and start testing and validating your changes.</li>
<li><a href="https://help.github.com/articles/using-pull-requests/">Open a pull request</a> against
the <code class="language-plaintext highlighter-rouge">master</code> branch of <code class="language-plaintext highlighter-rouge">apache/spark</code>. (Only in special cases would the PR be opened against other branches.) This
will trigger the &#8220;On pull request*&#8221; workflows on the Spark repository, which watch for successful workflow runs on your forked repository (and wait if one is still running).
<ol>
<li>The PR title should be of the form <code class="language-plaintext highlighter-rouge">[SPARK-xxxx][COMPONENT] Title</code>, where <code class="language-plaintext highlighter-rouge">SPARK-xxxx</code> is
the relevant JIRA number, <code class="language-plaintext highlighter-rouge">COMPONENT</code> is one of the PR categories shown at
<a href="https://spark-prs.appspot.com/">spark-prs.appspot.com</a> and
Title may be the JIRA&#8217;s title or a more specific title describing the PR itself.</li>
<li>If the pull request is still a work in progress, and so is not ready to be merged,
but needs to be pushed to GitHub to facilitate review, then add <code class="language-plaintext highlighter-rouge">[WIP]</code> after the component.</li>
<li>Consider identifying committers or other contributors who have worked on the code being
changed. Find the file(s) in GitHub and click &#8220;Blame&#8221; to see a line-by-line annotation of
who changed the code last. You can add <code class="language-plaintext highlighter-rouge">@username</code> in the PR description to ping them
immediately.</li>
<li>Please state that the contribution is your original work and that you license the work
to the project under the project&#8217;s open source license.</li>
</ol>
</li>
<li>The related JIRA, if any, will be marked as &#8220;In Progress&#8221; and your pull request will
automatically be linked to it. There is no need to be the Assignee of the JIRA to work on it,
though you are welcome to comment that you have begun work.</li>
<li>If there is a change related to SparkR in your pull request, AppVeyor will be triggered
automatically to test SparkR on Windows, which takes roughly an hour. As in the steps
above, fix any failures and push new commits, which will trigger a re-test in AppVeyor.</li>
</ol>
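<p>As a minimal sketch of the fork-and-branch workflow above (the username, JIRA number, and branch name
are placeholders; substitute your own):</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Clone your fork (replace &lt;username&gt;) and create a topic branch for the change
git clone https://github.com/&lt;username&gt;/spark.git
cd spark
git checkout -b SPARK-12345-short-description

# Verify compilation, tests, and style checks locally
./dev/run-tests

# Push the branch to your fork; this triggers the workflows enabled on the fork
git push origin SPARK-12345-short-description
</code></pre></div></div>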
<h3>The review process</h3>
<ul>
<li>Other reviewers, including committers, may comment on the changes and suggest modifications.
Changes can be added by simply pushing more commits to the same branch.</li>
<li>Lively, polite, rapid technical debate is encouraged from everyone in the community. The outcome
may be a rejection of the entire change.</li>
<li>Keep in mind that changes to more critical parts of Spark, like its core and SQL components, will
be subjected to more review, and may require more testing and proof of their correctness than
other changes.</li>
<li>Reviewers can indicate that a change looks suitable for merging with a comment such as: &#8220;I think
this patch looks good&#8221;. Spark uses the LGTM convention for indicating the strongest level of
technical sign-off on a patch: simply comment with the word &#8220;LGTM&#8221;. It specifically means: &#8220;I&#8217;ve
looked at this thoroughly and take as much ownership as if I wrote the patch myself&#8221;. If you
comment LGTM you will be expected to help with bugs or follow-up issues on the patch. Consistent,
judicious use of LGTMs is a great way to gain credibility as a reviewer with the broader community.</li>
<li>Sometimes, other changes will be merged which conflict with your pull request&#8217;s changes. The
PR can&#8217;t be merged until the conflict is resolved. This can be resolved by, for example, adding a remote
to keep up with upstream changes by <code class="language-plaintext highlighter-rouge">git remote add upstream https://github.com/apache/spark.git</code>,
running <code class="language-plaintext highlighter-rouge">git fetch upstream</code> followed by <code class="language-plaintext highlighter-rouge">git rebase upstream/master</code> and resolving the conflicts by hand,
then pushing the result to your branch (see the sketch after this list).</li>
<li>Try to be responsive to the discussion rather than letting days pass between replies</li>
</ul>
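<p>For example, the upstream-rebase sequence described above looks roughly like this, assuming your fork
is the <code class="language-plaintext highlighter-rouge">origin</code> remote and your PR branch is checked out:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># One-time setup: track the apache/spark repository as "upstream"
git remote add upstream https://github.com/apache/spark.git

# Replay your commits on top of the latest upstream master
git fetch upstream
git rebase upstream/master
# ... resolve any conflicts by hand, then run `git rebase --continue` ...

# Update the branch backing your pull request (replace &lt;your-branch&gt; with your PR branch)
git push --force-with-lease origin &lt;your-branch&gt;
</code></pre></div></div>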
<h3>Closing your pull request / JIRA</h3>
<ul>
<li>If a change is accepted, it will be merged and the pull request will automatically be closed,
along with the associated JIRA if any
<ul>
<li>Note that in the rare case you are asked to open a pull request against a branch besides
<code class="language-plaintext highlighter-rouge">master</code>, you will actually have to close the pull request manually</li>
<li>The JIRA will be Assigned to the primary contributor to the change as a way of giving credit.
If the JIRA isn&#8217;t closed and/or Assigned promptly, comment on the JIRA.</li>
</ul>
</li>
<li>If your pull request is ultimately rejected, please close it promptly
<ul>
<li>&#8230; because committers can&#8217;t close PRs directly</li>
<li>Pull requests will be automatically closed by an automated process at Apache after about a
week if a committer has made a comment like &#8220;mind closing this PR?&#8221; This means that the
committer is specifically requesting that it be closed.</li>
</ul>
</li>
<li>If a pull request has gotten little or no attention, consider improving the description or
the change itself and ping likely reviewers again after a few days. Consider proposing a
change that&#8217;s easier to include, like a smaller and/or less invasive change.</li>
<li>If it has been reviewed but not taken up after weeks, even after soliciting review from the
most relevant reviewers, or has met with neutral reactions, the outcome may be considered a
&#8220;soft no&#8221;. It is helpful to withdraw and close the PR in this case.</li>
<li>If a pull request is closed because it is deemed not the right approach to resolve a JIRA,
then leave the JIRA open. However, if the review makes it clear that the issue identified in
the JIRA is not going to be resolved by any pull request (not a problem, won&#8217;t fix) then also
resolve the JIRA.</li>
</ul>
<p><a name="code-style-guide"></a></p>
<h2>Code style guide</h2>
<p>Please follow the style of the existing codebase.</p>
<ul>
<li>For Python code, Apache Spark follows
<a href="http://legacy.python.org/dev/peps/pep-0008/">PEP 8</a> with one exception:
lines can be up to 100 characters in length, not 79.</li>
<li>For R code, Apache Spark follows
<a href="https://google.github.io/styleguide/Rguide.xml">Google&#8217;s R Style Guide</a> with three exceptions:
lines can be up to 100 characters in length, not 80; there is no limit on function name length, but names
must start with a lower case letter; and S4 objects/methods are allowed.</li>
<li>For Java code, Apache Spark follows
<a href="http://www.oracle.com/technetwork/java/codeconvtoc-136057.html">Oracle&#8217;s Java code conventions</a> and
Scala guidelines below. The latter is preferred.</li>
<li>For Scala code, Apache Spark follows the official
<a href="http://docs.scala-lang.org/style/">Scala style guide</a> and
<a href="https://github.com/databricks/scala-style-guide">Databricks Scala guide</a>. The latter is preferred. To format Scala code, run ./dev/scalafmt prior to submitting a PR.</li>
</ul>
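<p>A quick local check before opening a PR is sketched below. <code class="language-plaintext highlighter-rouge">./dev/scalafmt</code> is mentioned above; the
per-language lint scripts under <code class="language-plaintext highlighter-rouge">dev/</code> are listed on the assumption that their names match a recent
checkout, so verify them in your working copy:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Format Scala sources in place
./dev/scalafmt

# Run the per-language style checkers (verify these script names in your checkout)
./dev/lint-scala
./dev/lint-java
./dev/lint-python
./dev/lint-r
</code></pre></div></div>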
<h3>If in doubt</h3>
<p>If you&#8217;re not sure about the right style for something, try to follow the style of the existing
codebase. Look at whether there are other examples in the code that use your feature. Feel free
to ask on the <code class="language-plaintext highlighter-rouge">dev@spark.apache.org</code> list as well and/or ask committers.</p>
<h2>Code of conduct</h2>
<p>The Apache Spark project follows the <a href="https://www.apache.org/foundation/policies/conduct.html">Apache Software Foundation Code of Conduct</a>. The <a href="https://www.apache.org/foundation/policies/conduct.html">code of conduct</a> applies to all spaces managed by the Apache Software Foundation, including IRC, all public and private mailing lists, issue trackers, wikis, blogs, Twitter, and any other communication channel used by our communities. A code of conduct which is specific to in-person events (i.e., conferences) is codified in the published ASF anti-harassment policy.</p>
<p>We expect this code of conduct to be honored by everyone who participates in the Apache community formally or informally, or claims any affiliation with the Foundation, in any Foundation-related activities and especially when representing the ASF, in any role.</p>
<p>This code <u>is not exhaustive or complete</u>. It serves to distill our common understanding of a collaborative, shared environment and goals. We expect it to be followed in spirit as much as in the letter, so that it can enrich all of us and the technical communities in which we participate.</p>
<p>For more information and specific guidelines, refer to the <a href="https://www.apache.org/foundation/policies/conduct.html">Apache Software Foundation Code of Conduct</a>.</p>
</div>
<div class="col-12 col-md-3">
<div class="news" style="margin-bottom: 20px;">
<h5>Latest News</h5>
<ul class="list-unstyled">
<li><a href="/news/spark-3-4-3-released.html">Spark 3.4.3 released</a>
<span class="small">(Apr 18, 2024)</span></li>
<li><a href="/news/spark-3-5-1-released.html">Spark 3.5.1 released</a>
<span class="small">(Feb 23, 2024)</span></li>
<li><a href="/news/spark-3-3-4-released.html">Spark 3.3.4 released</a>
<span class="small">(Dec 16, 2023)</span></li>
<li><a href="/news/spark-3-4-2-released.html">Spark 3.4.2 released</a>
<span class="small">(Nov 30, 2023)</span></li>
</ul>
<p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p>
</div>
<div style="text-align:center; margin-bottom: 20px;">
<a href="https://www.apache.org/events/current-event.html">
<img src="https://www.apache.org/events/current-event-234x60.png" style="max-width: 100%;"/>
</a>
</div>
<div class="hidden-xs hidden-sm">
<a href="/downloads.html" class="btn btn-cta btn-lg d-grid" style="margin-bottom: 30px;">
Download Spark
</a>
<p style="font-size: 16px; font-weight: 500; color: #555;">
Built-in Libraries:
</p>
<ul class="list-none">
<li><a href="/sql/">SQL and DataFrames</a></li>
<li><a href="/streaming/">Spark Streaming</a></li>
<li><a href="/mllib/">MLlib (machine learning)</a></li>
<li><a href="/graphx/">GraphX (graph)</a></li>
</ul>
<a href="/third-party-projects.html">Third-Party Projects</a>
</div>
</div>
</div>
<footer class="small">
<hr>
Apache Spark, Spark, Apache, the Apache feather logo, and the Apache Spark project logo are either registered
trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
See guidance on use of Apache Spark <a href="/trademarks.html">trademarks</a>.
All other marks mentioned may be trademarks or registered trademarks of their respective owners.
Copyright &copy; 2018 The Apache Software Foundation, Licensed under the
<a href="https://www.apache.org/licenses/">Apache License, Version 2.0</a>.
</footer>
</div>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js"
integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM"
crossorigin="anonymous"></script>
<script src="https://code.jquery.com/jquery.js"></script>
<script src="/js/lang-tabs.js"></script>
<script src="/js/downloads.js"></script>
</body>
</html>