blob: 9dc0a71ad1db7658338c38b4d427aeeaf7b7cd83 [file] [log] [blame]
<!DOCTYPE html>
<html class="no-js">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Hardware Provisioning - Spark 4.1.0-preview1 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="css/custom.css" rel="stylesheet">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="css/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar">
<div class="navbar-brand"><a href="index.html">
<img src="https://spark.apache.org/images/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">4.1.0-preview1</span>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="streaming/index.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
<a class="dropdown-item" href="declarative-pipelines-programming-guide.html">Declarative Pipelines</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v4.1.0-preview1</span></span>-->
</div>
</nav>
<div class="container">
<div class="content mr-3" id="content">
<h1 class="title">Hardware Provisioning</h1>
<p>A common question received by Spark developers is how to configure hardware for it. While the right
hardware will depend on the situation, we make the following recommendations.</p>
<h1 id="storage-systems">Storage Systems</h1>
<p>Because most Spark jobs will likely have to read input data from an external storage system (e.g.
the Hadoop File System, or HBase), it is important to place it <strong>as close to this system as
possible</strong>. We recommend the following:</p>
<ul>
<li>
<p>If at all possible, run Spark on the same nodes as HDFS. The simplest way is to set up a Spark
<a href="spark-standalone.html">standalone mode cluster</a> on the same nodes, and configure Spark and
Hadoop&#8217;s memory and CPU usage to avoid interference (for Hadoop, the relevant options are
<code class="language-plaintext highlighter-rouge">mapred.child.java.opts</code> for the per-task memory and <code class="language-plaintext highlighter-rouge">mapreduce.tasktracker.map.tasks.maximum</code>
and <code class="language-plaintext highlighter-rouge">mapreduce.tasktracker.reduce.tasks.maximum</code> for number of tasks). Alternatively, you can run
Hadoop and Spark on a common cluster manager like <a href="running-on-yarn.html">Hadoop YARN</a>.</p>
</li>
<li>
<p>If this is not possible, run Spark on different nodes in the same local-area network as HDFS.</p>
</li>
<li>
<p>For low-latency data stores like HBase, it may be preferable to run computing jobs on different
nodes than the storage system to avoid interference.</p>
</li>
</ul>
<h1 id="local-disks">Local Disks</h1>
<p>While Spark can perform a lot of its computation in memory, it still uses local disks to store
data that doesn&#8217;t fit in RAM, as well as to preserve intermediate output between stages. We
recommend having <strong>4-8 disks</strong> per node, configured <em>without</em> RAID (just as separate mount points).
In Linux, mount the disks with the <code class="language-plaintext highlighter-rouge">noatime</code> option
to reduce unnecessary writes. In Spark, <a href="configuration.html">configure</a> the <code class="language-plaintext highlighter-rouge">spark.local.dir</code>
variable to be a comma-separated list of the local disks. If you are running HDFS, it&#8217;s fine to
use the same disks as HDFS.</p>
<h1 id="memory">Memory</h1>
<p>In general, Spark can run well with anywhere from <strong>8 GiB to hundreds of gigabytes</strong> of memory per
machine. In all cases, we recommend allocating only at most 75% of the memory for Spark; leave the
rest for the operating system and buffer cache.</p>
<p>How much memory you will need will depend on your application. To determine how much your
application uses for a certain dataset size, load part of your dataset in a Spark RDD and use the
Storage tab of Spark&#8217;s monitoring UI (<code class="language-plaintext highlighter-rouge">http://&lt;driver-node&gt;:4040</code>) to see its size in memory.
Note that memory usage is greatly affected by storage level and serialization format &#8211; see
the <a href="tuning.html">tuning guide</a> for tips on how to reduce it.</p>
<p>Finally, note that the Java VM does not always behave well with more than 200 GiB of RAM. If you
purchase machines with more RAM than this, you can launch multiple executors in a single node. In
Spark&#8217;s <a href="spark-standalone.html">standalone mode</a>, a worker is responsible for launching multiple
executors according to its available memory and cores, and each executor will be launched in a
separate Java VM.</p>
<h1 id="network">Network</h1>
<p>In our experience, when the data is in memory, a lot of Spark applications are network-bound.
Using a <strong>10 Gigabit</strong> or higher network is the best way to make these applications faster.
This is especially true for &#8220;distributed reduce&#8221; applications such as group-bys, reduce-bys, and
SQL joins. In any given application, you can see how much data Spark shuffles across the network
from the application&#8217;s monitoring UI (<code class="language-plaintext highlighter-rouge">http://&lt;driver-node&gt;:4040</code>).</p>
<h1 id="cpu-cores">CPU Cores</h1>
<p>Spark scales well to tens of CPU cores per machine because it performs minimal sharing between
threads. You should likely provision at least <strong>8-16 cores</strong> per machine. Depending on the CPU
cost of your workload, you may also need more: once data is in memory, most applications are
either CPU- or network-bound.</p>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="js/vendor/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extract content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how works DocSearch, check the docs of DocSearch.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:4.1.0-preview1"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>