| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <title>Integration with Cloud Infrastructures - Spark 2.4.4 Documentation</title> |
| |
| <meta name="description" content="Introduction to cloud storage support in Apache Spark 2.4.4"> |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <style> |
| body { |
| padding-top: 60px; |
| padding-bottom: 40px; |
| } |
| </style> |
| <meta name="viewport" content="width=device-width"> |
| <link rel="stylesheet" href="css/bootstrap-responsive.min.css"> |
| <link rel="stylesheet" href="css/main.css"> |
| |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| |
| |
| <!-- Google analytics script --> |
| <script type="text/javascript"> |
| var _gaq = _gaq || []; |
| _gaq.push(['_setAccount', 'UA-32518208-2']); |
| _gaq.push(['_trackPageview']); |
| |
| (function() { |
| var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; |
| ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; |
| var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); |
| })(); |
| </script> |
| |
| |
| </head> |
| <body> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <div class="navbar navbar-fixed-top" id="topbar"> |
| <div class="navbar-inner"> |
| <div class="container"> |
| <div class="brand"><a href="index.html"> |
| <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">2.4.4</span> |
| </div> |
| <ul class="nav"> |
| <!--TODO(andyk): Add class="active" attribute to li some how.--> |
| <li><a href="index.html">Overview</a></li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="quick-start.html">Quick Start</a></li> |
| <li><a href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a></li> |
| <li><a href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a></li> |
| <li><a href="structured-streaming-programming-guide.html">Structured Streaming</a></li> |
| <li><a href="streaming-programming-guide.html">Spark Streaming (DStreams)</a></li> |
| <li><a href="ml-guide.html">MLlib (Machine Learning)</a></li> |
| <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li> |
| <li><a href="sparkr.html">SparkR (R on Spark)</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li> |
| <li><a href="api/java/index.html">Java</a></li> |
| <li><a href="api/python/index.html">Python</a></li> |
| <li><a href="api/R/index.html">R</a></li> |
| <li><a href="api/sql/index.html">SQL, Built-in Functions</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="cluster-overview.html">Overview</a></li> |
| <li><a href="submitting-applications.html">Submitting Applications</a></li> |
| <li class="divider"></li> |
| <li><a href="spark-standalone.html">Spark Standalone</a></li> |
| <li><a href="running-on-mesos.html">Mesos</a></li> |
| <li><a href="running-on-yarn.html">YARN</a></li> |
| <li><a href="running-on-kubernetes.html">Kubernetes</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="configuration.html">Configuration</a></li> |
| <li><a href="monitoring.html">Monitoring</a></li> |
| <li><a href="tuning.html">Tuning Guide</a></li> |
| <li><a href="job-scheduling.html">Job Scheduling</a></li> |
| <li><a href="security.html">Security</a></li> |
| <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li> |
| <li class="divider"></li> |
| <li><a href="building-spark.html">Building Spark</a></li> |
| <li><a href="https://spark.apache.org/contributing.html">Contributing to Spark</a></li> |
| <li><a href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a></li> |
| </ul> |
| </li> |
| </ul> |
| <!--<p class="navbar-text pull-right"><span class="version-text">v2.4.4</span></p>--> |
| </div> |
| </div> |
| </div> |
| |
| <div class="container-wrapper"> |
| |
| |
| <div class="content" id="content"> |
| |
| <h1 class="title">Integration with Cloud Infrastructures</h1> |
| |
| |
| <!--- |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. See accompanying LICENSE file. |
| --> |
| |
| <ul id="markdown-toc"> |
| <li><a href="#introduction" id="markdown-toc-introduction">Introduction</a> <ul> |
| <li><a href="#important-cloud-object-stores-are-not-real-filesystems" id="markdown-toc-important-cloud-object-stores-are-not-real-filesystems">Important: Cloud Object Stores are Not Real Filesystems</a></li> |
| <li><a href="#installation" id="markdown-toc-installation">Installation</a></li> |
| <li><a href="#authenticating" id="markdown-toc-authenticating">Authenticating</a></li> |
| </ul> |
| </li> |
| <li><a href="#configuring" id="markdown-toc-configuring">Configuring</a> <ul> |
| <li><a href="#recommended-settings-for-writing-to-object-stores" id="markdown-toc-recommended-settings-for-writing-to-object-stores">Recommended settings for writing to object stores</a></li> |
| <li><a href="#parquet-io-settings" id="markdown-toc-parquet-io-settings">Parquet I/O Settings</a></li> |
| <li><a href="#orc-io-settings" id="markdown-toc-orc-io-settings">ORC I/O Settings</a></li> |
| </ul> |
| </li> |
| <li><a href="#spark-streaming-and-object-storage" id="markdown-toc-spark-streaming-and-object-storage">Spark Streaming and Object Storage</a></li> |
| <li><a href="#further-reading" id="markdown-toc-further-reading">Further Reading</a></li> |
| </ul> |
| |
| <h2 id="introduction">Introduction</h2> |
| |
| <p>All major cloud providers offer persistent data storage in <em>object stores</em>. |
| These are not classic “POSIX” file systems. |
| In order to store hundreds of petabytes of data without any single points of failure, |
| object stores replace the classic file system directory tree |
| with a simpler model of <code>object-name => data</code>. To enable remote access, operations |
| on objects are usually offered as (slow) HTTP REST operations.</p> |
| |
| <p>Spark can read and write data in object stores through filesystem connectors implemented |
| in Hadoop or provided by the infrastructure suppliers themselves. |
| These connectors make the object stores look <em>almost</em> like file systems, with directories and files |
| and the classic operations on them such as list, delete and rename.</p> |
| |
| <h3 id="important-cloud-object-stores-are-not-real-filesystems">Important: Cloud Object Stores are Not Real Filesystems</h3> |
| |
<p>While the stores appear to be filesystems, underneath
they are still object stores, <a href="https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/filesystem/introduction.html">and the difference is significant</a>.</p>
| |
| <p>They cannot be used as a direct replacement for a cluster filesystem such as HDFS |
| <em>except where this is explicitly stated</em>.</p> |
| |
| <p>Key differences are:</p> |
| |
| <ul> |
| <li>Changes to stored objects may not be immediately visible, both in directory listings and actual data access.</li> |
| <li>The means by which directories are emulated may make working with them slow.</li> |
| <li>Rename operations may be very slow and, on failure, leave the store in an unknown state.</li> |
| <li>Seeking within a file may require new HTTP calls, hurting performance.</li> |
| </ul> |
| |
| <p>How does this affect Spark?</p> |
| |
| <ol> |
| <li>Reading and writing data can be significantly slower than working with a normal filesystem.</li> |
| <li>Some directory structures may be very inefficient to scan during query split calculation.</li> |
| <li>The output of work may not be immediately visible to a follow-on query.</li> |
| <li>The rename-based algorithm by which Spark normally commits work when saving an RDD, DataFrame or Dataset |
| is potentially both slow and unreliable.</li> |
| </ol> |
| |
| <p>For these reasons, it is not always safe to use an object store as a direct destination of queries, or as |
| an intermediate store in a chain of queries. Consult the documentation of the object store and its |
| connector to determine which uses are considered safe.</p> |
| |
| <p>In particular: <em>without some form of consistency layer, Amazon S3 cannot |
| be safely used as the direct destination of work with the normal rename-based committer.</em></p> |
| |
| <h3 id="installation">Installation</h3> |
| |
| <p>With the relevant libraries on the classpath and Spark configured with valid credentials, |
| objects can be read or written by using their URLs as the path to data. |
For example, <code>sparkContext.textFile("s3a://landsat-pds/scene_list.gz")</code> will create
| an RDD of the file <code>scene_list.gz</code> stored in S3, using the s3a connector.</p> |
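
<p>As a minimal sketch of this pattern (the bucket name here is hypothetical, and the
<code>hadoop-cloud</code> module and valid credentials are assumed to be in place),
data can be read from and written back to a store by URL through the usual DataFrame APIs:</p>

<pre><code>import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("CloudReadWrite").getOrCreate()

// Read a CSV dataset directly from the object store, addressing it by URL.
val df = spark.read.option("header", "true").csv("s3a://example-bucket/input/")

// Write the result back to the store as Parquet, again by URL.
df.write.parquet("s3a://example-bucket/output/")
</code></pre>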
| |
| <p>To add the relevant libraries to an application’s classpath, include the <code>hadoop-cloud</code> |
| module and its dependencies.</p> |
| |
| <p>In Maven, add the following to the <code>pom.xml</code> file, assuming <code>spark.version</code> |
| is set to the chosen version of Spark:</p> |
| |
| <figure class="highlight"><pre><code class="language-xml" data-lang="xml"><span></span><span class="nt"><dependencyManagement></span> |
| ... |
| <span class="nt"><dependency></span> |
| <span class="nt"><groupId></span>org.apache.spark<span class="nt"></groupId></span> |
| <span class="nt"><artifactId></span>hadoop-cloud_2.11<span class="nt"></artifactId></span> |
| <span class="nt"><version></span>${spark.version}<span class="nt"></version></span> |
| <span class="nt"><scope></span>provided<span class="nt"></scope></span> |
| <span class="nt"></dependency></span> |
| ... |
| <span class="nt"></dependencyManagement></span></code></pre></figure> |
| |
<p>Commercial products based on Apache Spark generally set up the classpath
for talking to cloud infrastructures themselves, in which case this module may not be needed.</p>
| |
| <h3 id="authenticating">Authenticating</h3> |
| |
| <p>Spark jobs must authenticate with the object stores to access data within them.</p> |
| |
| <ol> |
| <li>When Spark is running in a cloud infrastructure, the credentials are usually automatically set up.</li> |
  <li><code>spark-submit</code> reads the <code>AWS_ACCESS_KEY_ID</code>, <code>AWS_SECRET_ACCESS_KEY</code>
and <code>AWS_SESSION_TOKEN</code> environment variables and sets the associated authentication options
for the <code>s3n</code> and <code>s3a</code> connectors to Amazon S3.</li>
| <li>In a Hadoop cluster, settings may be set in the <code>core-site.xml</code> file.</li> |
  <li>Authentication details may be manually added to the Spark configuration in <code>spark-defaults.conf</code>.</li>
  <li>Alternatively, they can be programmatically set in the <code>SparkConf</code> instance used to configure
the application’s <code>SparkContext</code>, as in the sketch after this list.</li>
| </ol> |
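
<p>Here is a minimal sketch of the programmatic route for the <code>s3a</code> connector.
The <code>spark.hadoop.</code> prefix propagates options into the Hadoop configuration;
reading the values from environment variables is just one way of keeping them out of the code itself:</p>

<pre><code>import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("CloudAuthExample")
// Pull the secrets from the environment rather than hard-coding them.
conf.set("spark.hadoop.fs.s3a.access.key", sys.env("AWS_ACCESS_KEY_ID"))
conf.set("spark.hadoop.fs.s3a.secret.key", sys.env("AWS_SECRET_ACCESS_KEY"))
val sc = new SparkContext(conf)
</code></pre>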
| |
<p><em>Important: never check authentication secrets into source code repositories,
especially public ones.</em></p>
| |
| <p>Consult <a href="https://hadoop.apache.org/docs/current/">the Hadoop documentation</a> for the relevant |
| configuration and security options.</p> |
| |
| <h2 id="configuring">Configuring</h2> |
| |
<p>Each cloud connector has its own set of configuration parameters; again,
consult the relevant documentation.</p>
| |
| <h3 id="recommended-settings-for-writing-to-object-stores">Recommended settings for writing to object stores</h3> |
| |
<p>For object stores whose consistency model means that rename-based commits are safe,
use the <code>FileOutputCommitter</code> v2 algorithm for performance:</p>
| |
| <pre><code>spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version 2 |
| </code></pre> |
| |
| <p>This does less renaming at the end of a job than the “version 1” algorithm. |
| As it still uses <code>rename()</code> to commit files, it is unsafe to use |
| when the object store does not have consistent metadata/listings.</p> |
| |
| <p>The committer can also be set to ignore failures when cleaning up temporary |
| files; this reduces the risk that a transient network problem is escalated into a |
| job failure:</p> |
| |
| <pre><code>spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored true |
| </code></pre> |
| |
<p>Storing temporary files can run up charges; delete
directories called <code>"_temporary"</code> on a regular basis to avoid this.</p>
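
<p>One way to do this cleanup, sketched here with the Hadoop <code>FileSystem</code> API
and a hypothetical bucket and output path:</p>

<pre><code>import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(new URI("s3a://example-bucket/"), new Configuration())
val temporary = new Path("s3a://example-bucket/output/_temporary")
// Recursively delete any attempt directories left behind by failed jobs.
if (fs.exists(temporary)) {
  fs.delete(temporary, true)
}
</code></pre>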
| |
| <h3 id="parquet-io-settings">Parquet I/O Settings</h3> |
| |
<p>For optimal performance when working with Parquet data, use the following settings:</p>
| |
| <pre><code>spark.hadoop.parquet.enable.summary-metadata false |
| spark.sql.parquet.mergeSchema false |
| spark.sql.parquet.filterPushdown true |
| spark.sql.hive.metastorePartitionPruning true |
| </code></pre> |
| |
| <p>These minimise the amount of data read during queries.</p> |
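
<p>As a sketch of an alternative to putting these in <code>spark-defaults.conf</code>,
the same settings can be applied when building the session:</p>

<pre><code>import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("ParquetOnObjectStore")
  .config("spark.hadoop.parquet.enable.summary-metadata", "false")
  .config("spark.sql.parquet.mergeSchema", "false")
  .config("spark.sql.parquet.filterPushdown", "true")
  .config("spark.sql.hive.metastorePartitionPruning", "true")
  .getOrCreate()
</code></pre>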
| |
| <h3 id="orc-io-settings">ORC I/O Settings</h3> |
| |
| <p>For best performance when working with ORC data, use these settings:</p> |
| |
| <pre><code>spark.sql.orc.filterPushdown true |
| spark.sql.orc.splits.include.file.footer true |
| spark.sql.orc.cache.stripe.details.size 10000 |
| spark.sql.hive.metastorePartitionPruning true |
| </code></pre> |
| |
| <p>Again, these minimise the amount of data read during queries.</p> |
| |
| <h2 id="spark-streaming-and-object-storage">Spark Streaming and Object Storage</h2> |
| |
<p>Spark Streaming can monitor files added to object stores by
creating a <code>FileInputDStream</code> to watch a path in the store, through a call to
<code>StreamingContext.textFileStream()</code>.</p>
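
<p>A minimal sketch, with a hypothetical bucket and a deliberately long batch interval,
since directory scans against an object store are slow (see the list below):</p>

<pre><code>import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName("CloudStreamExample")
val ssc = new StreamingContext(conf, Seconds(60))

// Each batch interval, pick up new text files appearing under this path.
val lines = ssc.textFileStream("s3a://example-bucket/incoming/")
lines.count().print()

ssc.start()
ssc.awaitTermination()
</code></pre>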
| |
| <ol> |
| <li> |
<p>The time to scan for new files is proportional to the number of files
under the path, not the number of <em>new</em> files, so it can become a slow operation.
The streaming window needs to be set large enough to handle this.</p>
| </li> |
| <li> |
| <p>Files only appear in an object store once they are completely written; there |
| is no need for a workflow of write-then-rename to ensure that files aren’t picked up |
| while they are still being written. Applications can write straight to the monitored directory.</p> |
| </li> |
| <li> |
| <p>Streams should only be checkpointed to a store implementing a fast and |
| atomic <code>rename()</code> operation. |
| Otherwise the checkpointing may be slow and potentially unreliable.</p> |
| </li> |
| </ol> |
| |
| <h2 id="further-reading">Further Reading</h2> |
| |
<p>Here is the documentation for the standard connectors, from both Apache and the cloud providers.</p>
| |
| <ul> |
| <li><a href="https://hadoop.apache.org/docs/current/hadoop-openstack/index.html">OpenStack Swift</a>. Hadoop 2.6+</li> |
| <li><a href="https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html">Azure Blob Storage</a>. Since Hadoop 2.7</li> |
| <li><a href="https://hadoop.apache.org/docs/current/hadoop-azure-datalake/index.html">Azure Data Lake</a>. Since Hadoop 2.8</li> |
| <li><a href="https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html">Amazon S3 via S3A and S3N</a>. Hadoop 2.6+</li> |
| <li><a href="https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-fs.html">Amazon EMR File System (EMRFS)</a>. From Amazon</li> |
| <li><a href="https://cloud.google.com/hadoop/google-cloud-storage-connector">Google Cloud Storage Connector for Spark and Hadoop</a>. From Google</li> |
| </ul> |
| |
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-1.12.4.min.js"></script> |
| <script src="js/vendor/bootstrap.min.js"></script> |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |