| <!DOCTYPE html> |
| <html class="no-js"> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>Security - Spark 4.1.0-preview1 Documentation</title> |
| |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet"> |
| <link href="css/custom.css" rel="stylesheet"> |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| <link rel="stylesheet" href="css/docsearch.min.css" /> |
| <link rel="stylesheet" href="css/docsearch.css"> |
| |
| |
| <!-- Matomo --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| |
| </head> |
| <body class="global"> |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| <nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar"> |
| <div class="navbar-brand"><a href="index.html"> |
| <img src="https://spark.apache.org/images/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">4.1.0-preview1</span> |
| </div> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" |
| data-target="#navbarCollapse" aria-controls="navbarCollapse" |
| aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarCollapse"> |
| <ul class="navbar-nav me-auto"> |
| <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> |
| <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> |
| <a class="dropdown-item" href="quick-start.html">Quick Start</a> |
| <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a> |
| <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> |
| <a class="dropdown-item" href="streaming/index.html">Structured Streaming</a> |
| <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> |
| <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> |
| <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> |
| <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> |
| <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> |
| <a class="dropdown-item" href="declarative-pipelines-programming-guide.html">Declarative Pipelines</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> |
| <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> |
| <a class="dropdown-item" href="api/python/index.html">Python</a> |
| <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> |
| <a class="dropdown-item" href="api/java/index.html">Java</a> |
| <a class="dropdown-item" href="api/R/index.html">R</a> |
| <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> |
| <div class="dropdown-menu" aria-labelledby="navbarDeploying"> |
| <a class="dropdown-item" href="cluster-overview.html">Overview</a> |
| <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> |
| <a class="dropdown-item" href="running-on-yarn.html">YARN</a> |
| <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> |
| <div class="dropdown-menu" aria-labelledby="navbarMore"> |
| <a class="dropdown-item" href="configuration.html">Configuration</a> |
| <a class="dropdown-item" href="monitoring.html">Monitoring</a> |
| <a class="dropdown-item" href="tuning.html">Tuning Guide</a> |
| <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> |
| <a class="dropdown-item" href="security.html">Security</a> |
| <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> |
| <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="building-spark.html">Building Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> |
| </div> |
| </li> |
| |
| <li class="nav-item"> |
| <input type="text" id="docsearch-input" placeholder="Search the docs…"> |
| </li> |
| </ul> |
| <!--<span class="navbar-text navbar-right"><span class="version-text">v4.1.0-preview1</span></span>--> |
| </div> |
| </nav> |
| |
| |
| |
| <div class="container"> |
| |
| <div class="content mr-3" id="content"> |
| |
| |
| <h1 class="title">Spark Security</h1> |
| |
| |
| <ul id="markdown-toc"> |
| <li><a href="#spark-security-things-you-need-to-know" id="markdown-toc-spark-security-things-you-need-to-know">Spark Security: Things You Need To Know</a></li> |
| <li><a href="#spark-rpc-communication-protocol-between-spark-processes" id="markdown-toc-spark-rpc-communication-protocol-between-spark-processes">Spark RPC (Communication protocol between Spark processes)</a> <ul> |
| <li><a href="#authentication" id="markdown-toc-authentication">Authentication</a> <ul> |
| <li><a href="#yarn" id="markdown-toc-yarn">YARN</a></li> |
| <li><a href="#kubernetes" id="markdown-toc-kubernetes">Kubernetes</a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li><a href="#network-encryption" id="markdown-toc-network-encryption">Network Encryption</a> <ul> |
| <li><a href="#ssl-encryption-preferred" id="markdown-toc-ssl-encryption-preferred">SSL Encryption (Preferred)</a></li> |
| <li><a href="#aes-based-encryption-legacy" id="markdown-toc-aes-based-encryption-legacy">AES-based Encryption (Legacy)</a></li> |
| </ul> |
| </li> |
| <li><a href="#local-storage-encryption" id="markdown-toc-local-storage-encryption">Local Storage Encryption</a></li> |
| <li><a href="#web-ui" id="markdown-toc-web-ui">Web UI</a> <ul> |
| <li><a href="#authentication-and-authorization" id="markdown-toc-authentication-and-authorization">Authentication and Authorization</a></li> |
| <li><a href="#spark-history-server-acls" id="markdown-toc-spark-history-server-acls">Spark History Server ACLs</a></li> |
| <li><a href="#ssl-configuration" id="markdown-toc-ssl-configuration">SSL Configuration</a></li> |
| <li><a href="#preparing-the-key-stores" id="markdown-toc-preparing-the-key-stores">Preparing the key stores</a> <ul> |
| <li><a href="#yarn-mode" id="markdown-toc-yarn-mode">YARN mode</a></li> |
| <li><a href="#standalone-mode" id="markdown-toc-standalone-mode">Standalone mode</a></li> |
| </ul> |
| </li> |
| <li><a href="#http-security-headers" id="markdown-toc-http-security-headers">HTTP Security Headers</a></li> |
| </ul> |
| </li> |
| <li><a href="#configuring-ports-for-network-security" id="markdown-toc-configuring-ports-for-network-security">Configuring Ports for Network Security</a> <ul> |
| <li><a href="#standalone-mode-only" id="markdown-toc-standalone-mode-only">Standalone mode only</a></li> |
| <li><a href="#all-cluster-managers" id="markdown-toc-all-cluster-managers">All cluster managers</a></li> |
| </ul> |
| </li> |
| <li><a href="#kerberos" id="markdown-toc-kerberos">Kerberos</a> <ul> |
| <li><a href="#long-running-applications" id="markdown-toc-long-running-applications">Long-Running Applications</a> <ul> |
| <li><a href="#using-a-keytab" id="markdown-toc-using-a-keytab">Using a Keytab</a></li> |
| <li><a href="#using-a-ticket-cache" id="markdown-toc-using-a-ticket-cache">Using a ticket cache</a></li> |
| </ul> |
| </li> |
| <li><a href="#secure-interaction-with-kubernetes" id="markdown-toc-secure-interaction-with-kubernetes">Secure Interaction with Kubernetes</a></li> |
| </ul> |
| </li> |
| <li><a href="#event-logging" id="markdown-toc-event-logging">Event Logging</a></li> |
| <li><a href="#persisting-driver-logs-in-client-mode" id="markdown-toc-persisting-driver-logs-in-client-mode">Persisting driver logs in client mode</a></li> |
| </ul> |
| |
| <h1 id="spark-security-things-you-need-to-know">Spark Security: Things You Need To Know</h1> |
| |
| <p>Security features like authentication are not enabled by default. When deploying a cluster that is open to the internet |
| or an untrusted network, it’s important to secure access to the cluster to prevent unauthorized applications |
| from running on the cluster.</p> |
| |
| <p>Spark supports multiple deployment types, and each one supports different levels of security. Not |
| all deployment types will be secure in all environments and none are secure by default. Be |
| sure to evaluate your environment, what Spark supports, and take the appropriate measures to secure |
| your Spark deployment.</p> |
| |
| <p>There are many different types of security concerns, and Spark does not necessarily protect against |
| all of them. Listed below are some of the features Spark supports. Also check the documentation |
| for the deployment type you are using for deployment-specific settings. Anything that is not |
| documented here is not supported by Spark.</p> |
| |
| <h1 id="spark-rpc-communication-protocol-between-spark-processes">Spark RPC (Communication protocol between Spark processes)</h1> |
| |
| <h2 id="authentication">Authentication</h2> |
| |
| <p>Spark currently supports authentication for RPC channels using a shared secret. Authentication can |
| be turned on by setting the <code class="language-plaintext highlighter-rouge">spark.authenticate</code> configuration parameter.</p> |
| |
| <p>The exact mechanism used to generate and distribute the shared secret is deployment-specific. Unless |
| specified below, the secret must be defined by setting the <code class="language-plaintext highlighter-rouge">spark.authenticate.secret</code> config |
| option. The same secret is shared by all Spark applications and daemons in that case, which limits |
| the security of these deployments, especially on multi-tenant clusters.</p> |
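| |
| <p>As a minimal sketch, a deployment that relies on a manually distributed secret might set the following in |
| <code class="language-plaintext highlighter-rouge">spark-defaults.conf</code> (the secret value shown is a placeholder; generate your own with a |
| cryptographically secure tool):</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Enable RPC authentication with a shared secret (placeholder value) |
| spark.authenticate        true |
| spark.authenticate.secret <securely-generated-random-string> |
| </code></pre></div></div> |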
| |
| <p>The REST Submission Server supports the HTTP <code class="language-plaintext highlighter-rouge">Authorization</code> header with |
| a cryptographically signed JSON Web Token via <code class="language-plaintext highlighter-rouge">JWSFilter</code>. |
| To enable authorization, the Spark Master should have the |
| <code class="language-plaintext highlighter-rouge">spark.master.rest.filters=org.apache.spark.ui.JWSFilter</code> and |
| <code class="language-plaintext highlighter-rouge">spark.org.apache.spark.ui.JWSFilter.param.secretKey=BASE64URL-ENCODED-KEY</code> configurations, and |
| the client should provide an HTTP <code class="language-plaintext highlighter-rouge">Authorization</code> header containing a JSON Web Token signed by |
| the shared secret key.</p> |
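| |
| <p>As an illustration (the key below is a placeholder), the Master side of this setup might look like:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># On the Spark Master (e.g. in spark-defaults.conf); the key is a placeholder |
| spark.master.rest.filters=org.apache.spark.ui.JWSFilter |
| spark.org.apache.spark.ui.JWSFilter.param.secretKey=BASE64URL-ENCODED-KEY |
| </code></pre></div></div> |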
| |
| <h3 id="yarn">YARN</h3> |
| |
| <p>When running on <a href="running-on-yarn.html">YARN</a>, Spark will automatically handle generating and |
| distributing the shared secret. Each application will use a unique shared secret. In |
| the case of YARN, this feature relies on YARN RPC encryption being enabled for the distribution of |
| secrets to be secure.</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.yarn.shuffle.server.recovery.disabled</code></td> |
| <td>false</td> |
| <td> |
| Set to true for applications that have higher security requirements and prefer that their |
| secret is not saved in the db. The shuffle data of such applications will not be recovered after |
| the External Shuffle Service restarts. |
| </td> |
| <td>3.5.0</td> |
| </tr> |
| </table> |
| |
| <h3 id="kubernetes">Kubernetes</h3> |
| |
| <p>On Kubernetes, Spark will also automatically generate an authentication secret unique to each |
| application. The secret is propagated to executor pods using environment variables. This means |
| that any user that can list pods in the namespace where the Spark application is running can |
| also see their authentication secret. Access control rules should be properly set up by the |
| Kubernetes admin to ensure that Spark authentication is secure.</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.authenticate</code></td> |
| <td>false</td> |
| <td>Whether Spark authenticates its internal connections.</td> |
| <td>1.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.authenticate.secret</code></td> |
| <td>None</td> |
| <td> |
| The secret key used for authentication. See above for when this configuration should be set. |
| </td> |
| <td>1.0.0</td> |
| </tr> |
| </table> |
| |
| <p>Alternatively, the authentication secret can be provided via files, for example Kubernetes secrets that |
| the user mounts into their pods.</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.authenticate.secret.file</code></td> |
| <td>None</td> |
| <td> |
| Path pointing to the secret key to use for securing connections. Ensure that the |
| contents of the file have been securely generated. This file is loaded on both the driver |
| and the executors unless other settings override this (see below). |
| </td> |
| <td>3.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.authenticate.secret.driver.file</code></td> |
| <td>The value of <code>spark.authenticate.secret.file</code></td> |
| <td> |
| When specified, overrides the location that the Spark driver reads to load the secret. |
| Useful in client mode, when the location of the secret file may differ in the pod versus |
| the node the driver is running on. When this is specified, |
| <code>spark.authenticate.secret.executor.file</code> must be specified so that the driver |
| and the executors can both use files to load the secret key. Ensure that the contents of the file |
| on the driver are identical to the contents of the file on the executors. |
| </td> |
| <td>3.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.authenticate.secret.executor.file</code></td> |
| <td>The value of <code>spark.authenticate.secret.file</code></td> |
| <td> |
| When specified, overrides the location that the Spark executors read to load the secret. |
| Useful in client mode, when the location of the secret file may differ in the pod versus |
| the node the driver is running on. When this is specified, |
| <code>spark.authenticate.secret.driver.file</code> must be specified so that the driver |
| and the executors can both use files to load the secret key. Ensure that the contents of the file |
| on the driver are identical to the contents of the file on the executors. |
| </td> |
| <td>3.0.0</td> |
| </tr> |
| </table> |
| |
| <p>Note that when using files, Spark will not mount these files into the containers for you. It is up |
| to you to ensure that the secret files are deployed securely into your containers and that the driver’s |
| secret file agrees with the executors’ secret file.</p> |
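| |
| <p>As one possible sketch (assuming a pre-created Kubernetes Secret named <code class="language-plaintext highlighter-rouge">spark-auth</code> with a key |
| <code class="language-plaintext highlighter-rouge">auth-secret</code>, and the pod secret mounting options described in the Kubernetes deployment guide), the |
| secret could be mounted and referenced like this:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Hypothetical example; the Secret name, key, and mount path are placeholders |
| --conf spark.kubernetes.driver.secrets.spark-auth=/mnt/secrets |
| --conf spark.kubernetes.executor.secrets.spark-auth=/mnt/secrets |
| --conf spark.authenticate=true |
| --conf spark.authenticate.secret.file=/mnt/secrets/auth-secret |
| </code></pre></div></div> |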
| |
| <h1 id="network-encryption">Network Encryption</h1> |
| |
| <p>Spark supports two mutually exclusive forms of encryption for RPC connections:</p> |
| |
| <p>The <strong>preferred method</strong> uses TLS (aka SSL) encryption via Netty’s support for SSL. Enabling SSL |
| requires keys and certificates to be properly configured. SSL is standardized and considered more |
| secure.</p> |
| |
| <p>The legacy method is an AES-based encryption mechanism relying on a shared secret. This requires |
| RPC authentication to also be enabled. This method uses a bespoke protocol and it is recommended |
| to use SSL instead.</p> |
| |
| <p>One may prefer to use the SSL based encryption in scenarios where compliance mandates the use |
| of specific protocols, or to leverage the security of a more standard encryption library. However, |
| the AES based encryption is simpler to configure and may be preferred if the only requirement |
| is that data be encrypted in transit.</p> |
| |
| <p>If both options are enabled in the configuration, the SSL based RPC encryption takes precedence |
| and the AES based encryption will not be used (and a warning message will be emitted).</p> |
| |
| <h2 id="ssl-encryption-preferred">SSL Encryption (Preferred)</h2> |
| |
| <p>Spark supports SSL based encryption for RPC connections. Please refer to the SSL Configuration |
| section below to understand how to configure it. The SSL settings are mostly similar across the UI |
| and RPC, however there are a few additional settings which are specific to the RPC implementation. |
| The RPC implementation uses Netty under the hood (while the UI uses Jetty), which supports a |
| different set of options.</p> |
| |
| <p>Unlike the other SSL settings for the UI, the RPC SSL is <em>not</em> automatically enabled if |
| <code class="language-plaintext highlighter-rouge">spark.ssl.enabled</code> is set. It must be explicitly enabled, to ensure a safe migration path for users |
| upgrading Spark versions.</p> |
| |
| <h2 id="aes-based-encryption-legacy">AES-based Encryption (Legacy)</h2> |
| |
| <p>Spark supports AES-based encryption for RPC connections. For encryption to be enabled, RPC |
| authentication must also be enabled and properly configured. AES encryption uses the |
| <a href="https://commons.apache.org/proper/commons-crypto/">Apache Commons Crypto</a> library, and Spark’s |
| configuration system allows access to that library’s configuration for advanced users.</p> |
| |
| <p>This legacy protocol has two mutually incompatible versions. Version 1 omits applying a key derivation function |
| (KDF) to the key exchange protocol’s output, while version 2 applies a KDF to ensure that the derived session |
| key is uniformly distributed. Version 1 is the default for backward compatibility. It is <strong>recommended to use version 2</strong> |
| for better security properties. The version can be configured by setting <code class="language-plaintext highlighter-rouge">spark.network.crypto.authEngineVersion</code> to |
| 1 or 2.</p> |
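| |
| <p>Putting these recommendations together, a sketch of a configuration that enables the legacy AES-based |
| encryption with the stronger settings (all of the options used here are described in the table below) might be:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># RPC authentication is required for AES-based encryption |
| spark.authenticate                      true |
| spark.network.crypto.enabled            true |
| spark.network.crypto.authEngineVersion  2 |
| spark.network.crypto.cipher             AES/GCM/NoPadding |
| </code></pre></div></div> |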
| |
| <p>There is also support for SASL-based encryption, although it should be considered deprecated. It |
| is still required when talking to shuffle services from Spark versions older than 2.2.0.</p> |
| |
| <p>The following table describes the different options available for configuring this feature.</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.network.crypto.enabled</code></td> |
| <td>false</td> |
| <td> |
| Enable AES-based RPC encryption, including the new authentication protocol added in 2.2.0. |
| </td> |
| <td>2.2.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.network.crypto.cipher</code></td> |
| <td>AES/CTR/NoPadding</td> |
| <td> |
| Cipher mode to use. Defaults to "AES/CTR/NoPadding" for backward compatibility, which is not authenticated. |
| It is recommended to use "AES/GCM/NoPadding", which is an authenticated encryption mode. |
| </td> |
| <td>4.0.0, 3.5.2, 3.4.4</td> |
| </tr> |
| <tr> |
| <td><code>spark.network.crypto.authEngineVersion</code></td> |
| <td>1</td> |
| <td>Version of AES-based RPC encryption to use. Valid versions are 1 or 2. Version 2 is recommended.</td> |
| <td>4.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.network.crypto.config.*</code></td> |
| <td>None</td> |
| <td> |
| Configuration values for the commons-crypto library, such as which cipher implementations to |
| use. The config name should be the name of commons-crypto configuration without the |
| <code>commons.crypto</code> prefix. |
| </td> |
| <td>2.2.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.network.crypto.saslFallback</code></td> |
| <td>true</td> |
| <td> |
| Whether to fall back to SASL authentication if authentication fails using Spark's internal |
| mechanism. This is useful when the application is connecting to old shuffle services that |
| do not support the internal Spark authentication protocol. On the shuffle service side, |
| disabling this feature will block older clients from authenticating. |
| </td> |
| <td>2.2.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.authenticate.enableSaslEncryption</code></td> |
| <td>false</td> |
| <td> |
| Enable SASL-based encrypted communication. |
| </td> |
| <td>2.2.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.network.sasl.serverAlwaysEncrypt</code></td> |
| <td>false</td> |
| <td> |
| Disable unencrypted connections for ports using SASL authentication. This will deny connections |
| from clients that have authentication enabled, but do not request SASL-based encryption. |
| </td> |
| <td>1.4.0</td> |
| </tr> |
| </table> |
| |
| <h1 id="local-storage-encryption">Local Storage Encryption</h1> |
| |
| <p>Spark supports encrypting temporary data written to local disks. This covers shuffle files, shuffle |
| spills and data blocks stored on disk (for both caching and broadcast variables). It does not cover |
| encrypting output data generated by applications with APIs such as <code class="language-plaintext highlighter-rouge">saveAsHadoopFile</code> or |
| <code class="language-plaintext highlighter-rouge">saveAsTable</code>. It also may not cover temporary files created explicitly by the user.</p> |
| |
| <p>The following settings cover enabling encryption for data written to disk:</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.io.encryption.enabled</code></td> |
| <td>false</td> |
| <td> |
| Enable local disk I/O encryption. Currently supported by all modes. It's strongly |
| recommended that RPC encryption be enabled when using this feature. |
| </td> |
| <td>2.1.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.io.encryption.keySizeBits</code></td> |
| <td>128</td> |
| <td> |
| IO encryption key size in bits. Supported values are 128, 192 and 256. |
| </td> |
| <td>2.1.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.io.encryption.keygen.algorithm</code></td> |
| <td>HmacSHA1</td> |
| <td> |
| The algorithm to use when generating the IO encryption key. The supported algorithms are |
| described in the KeyGenerator section of the Java Cryptography Architecture Standard Algorithm |
| Name Documentation. |
| </td> |
| <td>2.1.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.io.encryption.commons.config.*</code></td> |
| <td>None</td> |
| <td> |
| Configuration values for the commons-crypto library, such as which cipher implementations to |
| use. The config name should be the name of commons-crypto configuration without the |
| <code>commons.crypto</code> prefix. |
| </td> |
| <td>2.1.0</td> |
| </tr> |
| </table> |
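| |
| <p>For example, a minimal sketch that encrypts local shuffle and spill data with a 256-bit key, and (as |
| recommended above) also enables RPC authentication and encryption:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark.io.encryption.enabled      true |
| spark.io.encryption.keySizeBits  256 |
| # Strongly recommended alongside local disk encryption |
| spark.authenticate               true |
| spark.network.crypto.enabled     true |
| </code></pre></div></div> |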
| |
| <h1 id="web-ui">Web UI</h1> |
| |
| <h2 id="authentication-and-authorization">Authentication and Authorization</h2> |
| |
| <p>Enabling authentication for the Web UIs is done using <a href="https://jakarta.ee/specifications/servlet/5.0/apidocs/jakarta/servlet/filter">Jakarta Servlet filters</a>. |
| You will need a filter that implements the authentication method you want to deploy. Spark does not |
| provide any built-in authentication filters.</p> |
| |
| <p>Spark also supports access control to the UI when an authentication filter is present. Each |
| application can be configured with its own separate access control lists (ACLs). Spark |
| differentiates between “view” permissions (who is allowed to see the application’s UI), and “modify” |
| permissions (who can do things like kill jobs in a running application).</p> |
| |
| <p>ACLs can be configured for either users or groups. Configuration entries accept comma-separated |
| lists as input, meaning multiple users or groups can be given the desired privileges. This can be |
| used if you run on a shared cluster and have a set of administrators or developers who need to |
| monitor applications they may not have started themselves. A wildcard (<code class="language-plaintext highlighter-rouge">*</code>) added to a specific ACL |
| means that all users will have the respective privilege. By default, only the user submitting the |
| application is added to the ACLs.</p> |
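| |
| <p>As an illustration (the user and group names are placeholders), an application on a shared cluster might |
| combine the options described in the table below as follows:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Requires an authentication filter to be installed (see spark.ui.filters) |
| spark.acls.enable        true |
| spark.admin.acls         alice,bob |
| spark.admin.acls.groups  spark-admins |
| spark.ui.view.acls       analyst1,analyst2 |
| spark.modify.acls        oncall |
| </code></pre></div></div> |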
| |
| <p>Group membership is established by using a configurable group mapping provider. The mapper is |
| configured using the <code>spark.user.groups.mapping</code> config option, described in the table |
| below.</p> |
| |
| <p>The following options control the authentication of Web UIs:</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.ui.allowFramingFrom</code></td> |
| <td><code>SAMEORIGIN</code></td> |
| <td>Allow framing for a specific named URI via <code>X-Frame-Options</code>. By default, allow only from the same origin.</td> |
| <td>1.6.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.ui.filters</code></td> |
| <td>None</td> |
| <td> |
| Spark supports HTTP <code>Authorization</code> header with a cryptographically signed |
| JSON Web Token via <code>org.apache.spark.ui.JWSFilter</code>. <br /> |
| See the <a href="configuration.html#spark-ui">Spark UI</a> configuration for how to configure |
| filters. |
| </td> |
| <td>1.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.acls.enable</code></td> |
| <td>false</td> |
| <td> |
| Whether UI ACLs should be enabled. If enabled, this checks to see if the user has access |
| permissions to view or modify the application. Note this requires the user to be authenticated, |
| so if no authentication filter is installed, this option does not do anything. |
| </td> |
| <td>1.1.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.admin.acls</code></td> |
| <td>None</td> |
| <td> |
| Comma-separated list of users that have view and modify access to the Spark application. |
| </td> |
| <td>1.1.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.admin.acls.groups</code></td> |
| <td>None</td> |
| <td> |
| Comma-separated list of groups that have view and modify access to the Spark application. |
| </td> |
| <td>2.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.modify.acls</code></td> |
| <td>None</td> |
| <td> |
| Comma-separated list of users that have modify access to the Spark application. |
| </td> |
| <td>1.1.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.modify.acls.groups</code></td> |
| <td>None</td> |
| <td> |
| Comma-separated list of groups that have modify access to the Spark application. |
| </td> |
| <td>2.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.ui.view.acls</code></td> |
| <td>None</td> |
| <td> |
| Comma-separated list of users that have view access to the Spark application. |
| </td> |
| <td>1.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.ui.view.acls.groups</code></td> |
| <td>None</td> |
| <td> |
| Comma-separated list of groups that have view access to the Spark application. |
| </td> |
| <td>2.0.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.user.groups.mapping</code></td> |
| <td><code>org.apache.spark.security.ShellBasedGroupsMappingProvider</code></td> |
| <td> |
| The list of groups for a user is determined by a group mapping service defined by the trait |
| <code>org.apache.spark.security.GroupMappingServiceProvider</code>, which can be configured by |
| this property. |
| |
| <br />By default, a Unix shell-based implementation is used, which collects this information |
| from the host OS. |
| |
| <br /><em>Note:</em> This implementation supports only Unix/Linux-based environments. |
| Windows environment is currently <b>not</b> supported. However, a new platform/protocol can |
| be supported by implementing the trait mentioned above. |
| </td> |
| <td>2.0.0</td> |
| </tr> |
| </table> |
| |
| <p>On YARN, the view and modify ACLs are provided to the YARN service when submitting applications, and |
| control who has the respective privileges via YARN interfaces.</p> |
| |
| <h2 id="spark-history-server-acls">Spark History Server ACLs</h2> |
| |
| <p>Authentication for the SHS Web UI is enabled the same way as for regular applications, using |
| servlet filters.</p> |
| |
| <p>To enable authorization in the SHS, a few extra options are used:</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.history.ui.acls.enable</code></td> |
| <td>false</td> |
| <td> |
| Specifies whether ACLs should be checked to authorize users viewing the applications in |
| the history server. If enabled, access control checks are performed regardless of what the |
| individual applications had set for <code>spark.ui.acls.enable</code>. The application owner |
| will always have authorization to view their own application and any users specified via |
| <code>spark.ui.view.acls</code> and groups specified via <code>spark.ui.view.acls.groups</code> |
| when the application was run will also have authorization to view that application. |
| If disabled, no access control checks are made for any application UIs available through |
| the history server. |
| </td> |
| <td>1.0.1</td> |
| </tr> |
| <tr> |
| <td><code>spark.history.ui.admin.acls</code></td> |
| <td>None</td> |
| <td> |
| Comma separated list of users that have view access to all the Spark applications in history |
| server. |
| </td> |
| <td>2.1.1</td> |
| </tr> |
| <tr> |
| <td><code>spark.history.ui.admin.acls.groups</code></td> |
| <td>None</td> |
| <td> |
| Comma separated list of groups that have view access to all the Spark applications in history |
| server. |
| </td> |
| <td>2.1.1</td> |
| </tr> |
| </table> |
| |
| <p>The SHS uses the same options to configure the group mapping provider as regular applications. |
| In this case, the group mapping provider will apply to all UIs served by the SHS, and individual |
| application configurations will be ignored.</p> |
| |
| <h2 id="ssl-configuration">SSL Configuration</h2> |
| |
| <p>Configuration for SSL is organized hierarchically. The user can configure the default SSL settings |
| which will be used for all the supported communication protocols unless they are overridden by |
| protocol-specific settings. This way the user can easily provide the common settings for all the |
| protocols without disabling the ability to configure each one individually. Note that all settings |
| are inherited this way, <em>except</em> for <code class="language-plaintext highlighter-rouge">spark.ssl.rpc.enabled</code> which must be explicitly set.</p> |
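| |
| <p>For instance, a sketch (paths and passwords are placeholders) that provides common key store settings once |
| under <code class="language-plaintext highlighter-rouge">spark.ssl</code> and then enables SSL per component could look like:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Shared defaults, inherited by the namespaces below |
| spark.ssl.keyStore            /path/to/keystore.jks |
| spark.ssl.keyStorePassword    <keystore-password> |
| spark.ssl.keyPassword         <key-password> |
| spark.ssl.trustStore          /path/to/truststore.jks |
| spark.ssl.trustStorePassword  <truststore-password> |
| spark.ssl.protocol            TLSv1.3 |
| # Enable per component; spark.ssl.rpc.enabled is not inherited and must be set explicitly |
| spark.ssl.ui.enabled             true |
| spark.ssl.historyServer.enabled  true |
| spark.ssl.rpc.enabled            true |
| </code></pre></div></div> |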
| |
| <p>The following table describes the SSL configuration namespaces:</p> |
| |
| <table> |
| <thead> |
| <tr> |
| <th>Config Namespace</th> |
| <th>Component</th> |
| </tr> |
| </thead> |
| <tr> |
| <td><code>spark.ssl</code></td> |
| <td> |
| The default SSL configuration. These values will apply to all namespaces below, unless |
| explicitly overridden at the namespace level. |
| </td> |
| </tr> |
| <tr> |
| <td><code>spark.ssl.ui</code></td> |
| <td>Spark application Web UI</td> |
| </tr> |
| <tr> |
| <td><code>spark.ssl.standalone</code></td> |
| <td>Standalone Master / Worker Web UI</td> |
| </tr> |
| <tr> |
| <td><code>spark.ssl.historyServer</code></td> |
| <td>History Server Web UI</td> |
| </tr> |
| <tr> |
| <td><code>spark.ssl.rpc</code></td> |
| <td>Spark RPC communication</td> |
| </tr> |
| </table> |
| |
| <p>The full breakdown of available SSL options can be found below. The <code class="language-plaintext highlighter-rouge">${ns}</code> placeholder should be |
| replaced with one of the above namespaces.</p> |
| |
| <table> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Supported Namespaces</th></tr></thead> |
| <tr> |
| <td><code>${ns}.enabled</code></td> |
| <td>false</td> |
| <td>Enables SSL. When enabled, <code>${ns}.protocol</code> is required.</td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.port</code></td> |
| <td>None</td> |
| <td> |
| The port on which the SSL service will listen. |
| |
| <br />The port must be defined within a specific namespace configuration. The default |
| namespace is ignored when reading this configuration. |
| |
| <br />When not set, the SSL port will be derived from the non-SSL port for the |
| same service. A value of "0" will make the service bind to an ephemeral port. |
| </td> |
| <td>ui,standalone,historyServer</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.enabledAlgorithms</code></td> |
| <td>None</td> |
| <td> |
| A comma-separated list of ciphers. The specified ciphers must be supported by the JVM. |
| |
| <br />The reference list of protocols can be found in the "JSSE Cipher Suite Names" section |
| of the Java security guide. The list for Java 17 can be found at |
| <a href="https://docs.oracle.com/en/java/javase/17/docs/specs/security/standard-names.html#jsse-cipher-suite-names">this</a> |
| page. |
| |
| <br />Note: If not set, the default cipher suite for the JRE will be used. |
| </td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.keyPassword</code></td> |
| <td>None</td> |
| <td> |
| The password to the private key in the key store. |
| </td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.keyStore</code></td> |
| <td>None</td> |
| <td> |
| Path to the key store file. The path can be absolute or relative to the directory in which the |
| process is started. |
| </td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.keyStorePassword</code></td> |
| <td>None</td> |
| <td>Password to the key store.</td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.keyStoreType</code></td> |
| <td>JKS</td> |
| <td>The type of the key store.</td> |
| <td>ui,standalone,historyServer</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.protocol</code></td> |
| <td>None</td> |
| <td> |
| TLS protocol to use. The protocol must be supported by the JVM. |
| |
| <br />The reference list of protocols can be found in the "Additional JSSE Standard Names" |
| section of the Java security guide. For Java 17, the list can be found at |
| <a href="https://docs.oracle.com/en/java/javase/17/docs/specs/security/standard-names.html#additional-jsse-standard-names">this</a> |
| page. |
| </td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.needClientAuth</code></td> |
| <td>false</td> |
| <td> |
| Whether to require client authentication. |
| </td> |
| <td>ui,standalone,historyServer</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.trustStore</code></td> |
| <td>None</td> |
| <td> |
| Path to the trust store file. The path can be absolute or relative to the directory in which |
| the process is started. |
| </td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.trustStorePassword</code></td> |
| <td>None</td> |
| <td>Password for the trust store.</td> |
| <td>ui,standalone,historyServer,rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.trustStoreType</code></td> |
| <td>JKS</td> |
| <td>The type of the trust store.</td> |
| <td>ui,standalone,historyServer</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.openSSLEnabled</code></td> |
| <td>false</td> |
| <td> |
| Whether to use OpenSSL for cryptographic operations instead of the JDK SSL provider. |
| This setting requires the <code>certChain</code> and <code>privateKey</code> settings to be set. |
| This takes precedence over the <code>keyStore</code> and <code>trustStore</code> settings if both are specified. |
| If the OpenSSL library is not available at runtime, we will fall back to the JDK provider. |
| </td> |
| <td>rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.privateKey</code></td> |
| <td>None</td> |
| <td> |
| Path to the private key file in PEM format. The path can be absolute or relative to the |
| directory in which the process is started. |
| This setting is required when using the OpenSSL implementation. |
| </td> |
| <td>rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.privateKeyPassword</code></td> |
| <td>None</td> |
| <td> |
| The password to the above private key file in PEM format. |
| </td> |
| <td>rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.certChain</code></td> |
| <td>None</td> |
| <td> |
| Path to the certificate chain file in PEM format. The path can be absolute or relative to the |
| directory in which the process is started. |
| This setting is required when using the OpenSSL implementation. |
| </td> |
| <td>rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.trustStoreReloadingEnabled</code></td> |
| <td>false</td> |
| <td> |
| Whether the trust store should be reloaded periodically. |
| This setting is mostly only useful in standalone deployments, not k8s or yarn deployments. |
| </td> |
| <td>rpc</td> |
| </tr> |
| <tr> |
| <td><code>${ns}.trustStoreReloadIntervalMs</code></td> |
| <td>10000</td> |
| <td> |
| The interval at which the trust store should be reloaded (in milliseconds). |
| This setting is mostly only useful in standalone deployments, not k8s or yarn deployments. |
| </td> |
| <td>rpc</td> |
| </tr> |
| </table> |
| |
| <p>Spark also supports retrieving <code class="language-plaintext highlighter-rouge">${ns}.keyPassword</code>, <code class="language-plaintext highlighter-rouge">${ns}.keyStorePassword</code> and <code class="language-plaintext highlighter-rouge">${ns}.trustStorePassword</code> from |
| <a href="https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html">Hadoop Credential Providers</a>. |
| Users can store passwords in a credential file and make it accessible to different components, for example:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>hadoop credential create spark.ssl.keyPassword -value password \ |
| -provider jceks://hdfs@nn1.example.com:9001/user/backup/ssl.jceks |
| </code></pre></div></div> |
| |
| <p>To configure the location of the credential provider, set the <code class="language-plaintext highlighter-rouge">hadoop.security.credential.provider.path</code> |
| config option in the Hadoop configuration used by Spark, like:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <property> |
| <name>hadoop.security.credential.provider.path</name> |
| <value>jceks://hdfs@nn1.example.com:9001/user/backup/ssl.jceks</value> |
| </property> |
| </code></pre></div></div> |
| |
| <p>Or via SparkConf “spark.hadoop.hadoop.security.credential.provider.path=jceks://hdfs@nn1.example.com:9001/user/backup/ssl.jceks”.</p> |
| |
| <h2 id="preparing-the-key-stores">Preparing the key stores</h2> |
| |
| <p>Key stores can be generated by the <code class="language-plaintext highlighter-rouge">keytool</code> program. The reference documentation for this tool for |
| Java 17 is <a href="https://docs.oracle.com/en/java/javase/17/docs/specs/man/keytool.html">here</a>. |
| The most basic steps to configure the key stores and the trust store for a Spark Standalone |
| deployment are as follows (a sketch of these steps with <code class="language-plaintext highlighter-rouge">keytool</code> is shown after the list):</p> |
| |
| <ul> |
| <li>Generate a key pair for each node</li> |
| <li>Export the public key of the key pair to a file on each node</li> |
| <li>Import all exported public keys into a single trust store</li> |
| <li>Distribute the trust store to the cluster nodes</li> |
| </ul> |
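| |
| <p>As a sketch (host names, aliases, and file names are placeholders), these steps might look like the following |
| <code class="language-plaintext highlighter-rouge">keytool</code> session:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># On each node: generate a key pair in a node-local key store |
| keytool -genkeypair -alias node1 -keyalg RSA -keysize 4096 \ |
|   -dname "CN=node1.example.com" -keystore node1-keystore.jks |
| # On each node: export the public certificate of the key pair |
| keytool -exportcert -alias node1 -keystore node1-keystore.jks -file node1.cer |
| # Import every exported certificate into a single trust store |
| keytool -importcert -alias node1 -file node1.cer -keystore truststore.jks -noprompt |
| # Finally, distribute truststore.jks to all cluster nodes |
| </code></pre></div></div> |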
| |
| <h3 id="yarn-mode">YARN mode</h3> |
| |
| <p>To provide a local trust store or key store file to drivers running in cluster mode, the files can be |
| distributed with the application using the <code class="language-plaintext highlighter-rouge">--files</code> command line argument (or the equivalent |
| <code class="language-plaintext highlighter-rouge">spark.files</code> configuration). The files will be placed in the driver’s working directory, so the TLS |
| configuration should just reference the file name with no absolute path.</p> |
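| |
| <p>For example, a hypothetical submission (the application class, jar, and passwords are placeholders) could |
| ship the stores alongside the application like this:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark-submit --master yarn --deploy-mode cluster \ |
|   --files /local/path/keystore.jks,/local/path/truststore.jks \ |
|   --conf spark.ssl.enabled=true \ |
|   --conf spark.ssl.keyStore=keystore.jks \ |
|   --conf spark.ssl.keyStorePassword=<keystore-password> \ |
|   --conf spark.ssl.trustStore=truststore.jks \ |
|   --conf spark.ssl.trustStorePassword=<truststore-password> \ |
|   --class com.example.MyApp my-app.jar |
| </code></pre></div></div> |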
| |
| <p>Distributing local key stores this way may require the files to be staged in HDFS (or other similar |
| distributed file system used by the cluster), so it’s recommended that the underlying file system be |
| configured with security in mind (e.g. by enabling authentication and wire encryption).</p> |
| |
| <h3 id="standalone-mode">Standalone mode</h3> |
| |
| <p>The user needs to provide key stores and configuration options for master and workers. They have to |
| be set by attaching appropriate Java system properties in <code class="language-plaintext highlighter-rouge">SPARK_MASTER_OPTS</code> and in |
| <code class="language-plaintext highlighter-rouge">SPARK_WORKER_OPTS</code> environment variables, or just in <code class="language-plaintext highlighter-rouge">SPARK_DAEMON_JAVA_OPTS</code>.</p> |
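| |
| <p>As a sketch (paths and passwords are placeholders), this could be done in <code class="language-plaintext highlighter-rouge">conf/spark-env.sh</code> on the |
| master and worker hosts:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># conf/spark-env.sh (applies to both the master and worker daemons) |
| SPARK_DAEMON_JAVA_OPTS="-Dspark.ssl.enabled=true \ |
|   -Dspark.ssl.keyStore=/path/to/keystore.jks \ |
|   -Dspark.ssl.keyStorePassword=<keystore-password> \ |
|   -Dspark.ssl.trustStore=/path/to/truststore.jks \ |
|   -Dspark.ssl.trustStorePassword=<truststore-password>" |
| </code></pre></div></div> |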
| |
| <p>The user may allow the executors to use the SSL settings inherited from the worker process. That |
| can be accomplished by setting <code class="language-plaintext highlighter-rouge">spark.ssl.useNodeLocalConf</code> to <code class="language-plaintext highlighter-rouge">true</code>. In that case, the settings |
| provided by the user on the client side are not used.</p> |
| |
| <h2 id="http-security-headers">HTTP Security Headers</h2> |
| |
| <p>Apache Spark can be configured to include HTTP headers to aid in preventing Cross Site Scripting |
| (XSS), Cross-Frame Scripting (XFS), MIME-Sniffing, and also to enforce HTTP Strict Transport |
| Security.</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.ui.xXssProtection</code></td> |
| <td><code>1; mode=block</code></td> |
| <td> |
| Value for HTTP X-XSS-Protection response header. You can choose appropriate value |
| from below: |
| <ul> |
| <li><code>0</code> (Disables XSS filtering)</li> |
| <li><code>1</code> (Enables XSS filtering. If a cross-site scripting attack is detected, |
| the browser will sanitize the page.)</li> |
| <li><code>1; mode=block</code> (Enables XSS filtering. The browser will prevent rendering |
| of the page if an attack is detected.)</li> |
| </ul> |
| </td> |
| <td>2.3.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.ui.xContentTypeOptions.enabled</code></td> |
| <td><code>true</code></td> |
| <td> |
| When enabled, X-Content-Type-Options HTTP response header will be set to "nosniff". |
| </td> |
| <td>2.3.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.ui.strictTransportSecurity</code></td> |
| <td>None</td> |
| <td> |
| Value for HTTP Strict Transport Security (HSTS) Response Header. You can choose appropriate |
| value from below and set <code>expire-time</code> accordingly. This option is only used when |
| SSL/TLS is enabled. |
| <ul> |
| <li><code>max-age=<expire-time></code></li> |
| <li><code>max-age=<expire-time>; includeSubDomains</code></li> |
| <li><code>max-age=<expire-time>; preload</code></li> |
| </ul> |
| </td> |
| <td>2.3.0</td> |
| </tr> |
| </table> |
| |
| <h1 id="configuring-ports-for-network-security">Configuring Ports for Network Security</h1> |
| |
| <p>Generally speaking, a Spark cluster and its services are not deployed on the public internet. |
| They are generally private services, and should only be accessible within the network of the |
| organization that deploys Spark. Access to the hosts and ports used by Spark services should |
| be limited to origin hosts that need to access the services.</p> |
| |
| <p>However, as with the REST Submission port, Spark also supports the HTTP <code class="language-plaintext highlighter-rouge">Authorization</code> header |
| with a cryptographically signed JSON Web Token (JWT) for all UI ports. |
| To use it, a user needs to configure |
| <code class="language-plaintext highlighter-rouge">spark.ui.filters=org.apache.spark.ui.JWSFilter</code> and |
| <code class="language-plaintext highlighter-rouge">spark.org.apache.spark.ui.JWSFilter.param.secretKey=BASE64URL-ENCODED-KEY</code>.</p> |
| |
| <p>Below are the primary ports that Spark uses for its communication and how to |
| configure those ports.</p> |
| |
| <h2 id="standalone-mode-only">Standalone mode only</h2> |
| |
| <table> |
| <thead> |
| <tr> |
| <th>From</th><th>To</th><th>Default Port</th><th>Purpose</th><th>Configuration |
| Setting</th><th>Notes</th> |
| </tr> |
| </thead> |
| <tr> |
| <td>Browser</td> |
| <td>Standalone Master</td> |
| <td>8080</td> |
| <td>Web UI</td> |
| <td><code>spark.master.ui.port /<br /> SPARK_MASTER_WEBUI_PORT</code></td> |
| <td>Jetty-based. Standalone mode only.</td> |
| </tr> |
| <tr> |
| <td>Browser</td> |
| <td>Standalone Worker</td> |
| <td>8081</td> |
| <td>Web UI</td> |
| <td><code>spark.worker.ui.port /<br /> SPARK_WORKER_WEBUI_PORT</code></td> |
| <td>Jetty-based. Standalone mode only.</td> |
| </tr> |
| <tr> |
| <td>Driver /<br /> Standalone Worker</td> |
| <td>Standalone Master</td> |
| <td>7077</td> |
| <td>Submit job to cluster /<br /> Join cluster</td> |
| <td><code>SPARK_MASTER_PORT</code></td> |
| <td>Set to "0" to choose a port randomly. Standalone mode only.</td> |
| </tr> |
| <tr> |
| <td>External Service</td> |
| <td>Standalone Master</td> |
| <td>6066</td> |
| <td>Submit job to cluster via REST API</td> |
| <td><code>spark.master.rest.port</code></td> |
| <td>Use <code>spark.master.rest.enabled</code> to enable/disable this service. Standalone mode only.</td> |
| </tr> |
| <tr> |
| <td>Standalone Master</td> |
| <td>Standalone Worker</td> |
| <td>(random)</td> |
| <td>Schedule executors</td> |
| <td><code>SPARK_WORKER_PORT</code></td> |
| <td>Set to "0" to choose a port randomly. Standalone mode only.</td> |
| </tr> |
| </table> |
| |
| <h2 id="all-cluster-managers">All cluster managers</h2> |
| |
| <table> |
| <thead> |
| <tr> |
| <th>From</th><th>To</th><th>Default Port</th><th>Purpose</th><th>Configuration |
| Setting</th><th>Notes</th> |
| </tr> |
| </thead> |
| <tr> |
| <td>Browser</td> |
| <td>Application</td> |
| <td>4040</td> |
| <td>Web UI</td> |
| <td><code>spark.ui.port</code></td> |
| <td>Jetty-based</td> |
| </tr> |
| <tr> |
| <td>Browser</td> |
| <td>History Server</td> |
| <td>18080</td> |
| <td>Web UI</td> |
| <td><code>spark.history.ui.port</code></td> |
| <td>Jetty-based</td> |
| </tr> |
| <tr> |
| <td>Executor /<br /> Standalone Master</td> |
| <td>Driver</td> |
| <td>(random)</td> |
| <td>Connect to application /<br /> Notify executor state changes</td> |
| <td><code>spark.driver.port</code></td> |
| <td>Set to "0" to choose a port randomly.</td> |
| </tr> |
| <tr> |
| <td>Executor / Driver</td> |
| <td>Executor / Driver</td> |
| <td>(random)</td> |
| <td>Block Manager port</td> |
| <td><code>spark.blockManager.port</code></td> |
| <td>Raw socket via ServerSocketChannel</td> |
| </tr> |
| </table> |
| |
| <h1 id="kerberos">Kerberos</h1> |
| |
| <p>Spark supports submitting applications in environments that use Kerberos for authentication. |
| In most cases, Spark relies on the credentials of the current logged in user when authenticating |
| to Kerberos-aware services. Such credentials can be obtained by logging in to the configured KDC |
| with tools like <code class="language-plaintext highlighter-rouge">kinit</code>.</p> |
| |
| <p>When talking to Hadoop-based services, Spark needs to obtain delegation tokens so that non-local |
| processes can authenticate. Spark ships with support for HDFS and other Hadoop file systems, Hive |
| and HBase.</p> |
| |
| <p>When using a Hadoop filesystem (such as HDFS or WebHDFS), Spark will acquire the relevant tokens |
| for the service hosting the user’s home directory.</p> |
| |
| <p>An HBase token will be obtained if HBase is in the application’s classpath, and the HBase |
| configuration has Kerberos authentication turned on (<code class="language-plaintext highlighter-rouge">hbase.security.authentication=kerberos</code>).</p> |
| |
| <p>Similarly, a Hive token will be obtained if Hive is in the classpath, and the configuration includes |
| URIs for remote metastore services (<code class="language-plaintext highlighter-rouge">hive.metastore.uris</code> is not empty).</p> |
| |
| <p>If an application needs to interact with other secure Hadoop filesystems, their URIs need to be |
| explicitly provided to Spark at launch time. This is done by listing them in the |
| <code class="language-plaintext highlighter-rouge">spark.kerberos.access.hadoopFileSystems</code> property, described in the configuration section below.</p> |
| |
| <p>Spark also supports custom delegation token providers using the Java Services |
| mechanism (see <code class="language-plaintext highlighter-rouge">java.util.ServiceLoader</code>). Implementations of |
| <code class="language-plaintext highlighter-rouge">org.apache.spark.security.HadoopDelegationTokenProvider</code> can be made available to Spark |
| by listing their names in the corresponding file in the jar’s <code class="language-plaintext highlighter-rouge">META-INF/services</code> directory.</p> |
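| |
| <p>As a sketch (the provider class and package are hypothetical), a custom provider packaged in a jar would be |
| registered through a services file named after the provider interface:</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># File inside the jar: |
| # META-INF/services/org.apache.spark.security.HadoopDelegationTokenProvider |
| com.example.security.MyServiceDelegationTokenProvider |
| </code></pre></div></div> |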
| |
| <p>Delegation tokens are currently only supported in YARN and Kubernetes modes. Consult the |
| deployment-specific page for more information.</p> |
| |
| <p>The following options provide finer-grained control over this feature:</p> |
| |
| <table class="spark-config"> |
| <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> |
| <tr> |
| <td><code>spark.security.credentials.${service}.enabled</code></td> |
| <td><code>true</code></td> |
| <td> |
| Controls whether to obtain credentials for services when security is enabled. |
| By default, credentials for all supported services are retrieved when those services are |
| configured, but it's possible to disable that behavior if it somehow conflicts with the |
| application being run. |
| </td> |
| <td>2.3.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.kerberos.access.hadoopFileSystems</code></td> |
| <td>(none)</td> |
| <td> |
| A comma-separated list of secure Hadoop filesystems your Spark application is going to access. For |
| example, <code>spark.kerberos.access.hadoopFileSystems=hdfs://nn1.com:8032,hdfs://nn2.com:8032, |
| webhdfs://nn3.com:50070</code>. The Spark application must have access to the filesystems listed |
| and Kerberos must be properly configured to be able to access them (either in the same realm |
| or in a trusted realm). Spark acquires security tokens for each of the filesystems so that |
| the Spark application can access those remote Hadoop filesystems. |
| </td> |
| <td>3.0.0</td> |
| </tr> |
| </table> |
| |
| <p>Users can exclude Kerberos delegation token renewal at the resource scheduler. Currently this is only supported |
| on YARN. The configuration is covered in the <a href="running-on-yarn.html#yarn-specific-kerberos-configuration">Running Spark on YARN</a> page.</p> |
| |
| <h2 id="long-running-applications">Long-Running Applications</h2> |
| |
| <p>Long-running applications may run into issues if their run time exceeds the maximum delegation |
| token lifetime configured in services it needs to access.</p> |
| |
| <p>This feature is not available everywhere. In particular, it’s only implemented |
| on YARN and Kubernetes (both client and cluster modes).</p> |
| |
| <p>Spark supports automatically creating new tokens for these applications. There are two ways to |
| enable this functionality.</p> |
| |
| <h3 id="using-a-keytab">Using a Keytab</h3> |
| |
| <p>By providing Spark with a principal and keytab (e.g. using <code class="language-plaintext highlighter-rouge">spark-submit</code> with <code class="language-plaintext highlighter-rouge">--principal</code> |
| and <code class="language-plaintext highlighter-rouge">--keytab</code> parameters), the application will maintain a valid Kerberos login that can be |
| used to retrieve delegation tokens indefinitely.</p> |
| |
| <p>Note that when using a keytab in cluster mode, it will be copied over to the machine running the |
| Spark driver. In the case of YARN, this means using HDFS as a staging area for the keytab, so it’s |
| strongly recommended that both YARN and HDFS be secured with encryption, at least.</p> |
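| |
| <p>For example (the principal, keytab path, application class, and jar are placeholders):</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark-submit --master yarn --deploy-mode cluster \ |
|   --principal user@EXAMPLE.COM \ |
|   --keytab /path/to/user.keytab \ |
|   --class com.example.MyApp my-app.jar |
| </code></pre></div></div> |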
| |
| <h3 id="using-a-ticket-cache">Using a ticket cache</h3> |
| |
| <p>By setting <code class="language-plaintext highlighter-rouge">spark.kerberos.renewal.credentials</code> to <code class="language-plaintext highlighter-rouge">ccache</code> in Spark’s configuration, the local |
| Kerberos ticket cache will be used for authentication. Spark will keep the ticket renewed during its |
| renewable life, but after it expires a new ticket needs to be acquired (e.g. by running <code class="language-plaintext highlighter-rouge">kinit</code>).</p> |
| |
| <p>It’s up to the user to maintain an updated ticket cache that Spark can use.</p> |
| |
| <p>The location of the ticket cache can be customized by setting the <code class="language-plaintext highlighter-rouge">KRB5CCNAME</code> environment |
| variable.</p> |
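| |
| <p>A sketch of this workflow (the principal, application class, and jar are placeholders):</p> |
| |
| <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Optionally use a non-default ticket cache location |
| export KRB5CCNAME=/tmp/krb5cc_spark |
| # Obtain (and later refresh) a ticket in that cache; Spark renews it during its renewable life |
| kinit user@EXAMPLE.COM |
| spark-submit --conf spark.kerberos.renewal.credentials=ccache \ |
|   --class com.example.MyApp my-app.jar |
| </code></pre></div></div> |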
| |
| <h2 id="secure-interaction-with-kubernetes">Secure Interaction with Kubernetes</h2> |
| |
| <p>As noted above, when talking to Hadoop-based services behind Kerberos, Spark needs to obtain delegation tokens |
| so that non-local processes can authenticate. These delegation tokens in Kubernetes are stored in Secrets that are |
| shared by the Driver and its Executors. As such, there are three ways of submitting a Kerberos job:</p> |
| |
| <p>In all cases you must define either the <code class="language-plaintext highlighter-rouge">HADOOP_CONF_DIR</code> environment variable or the |
| <code class="language-plaintext highlighter-rouge">spark.kubernetes.hadoop.configMapName</code> configuration.</p> |
| |
| <p>It is also important to note that the KDC needs to be visible from inside the containers.</p> |
| |
| <p>If a user wishes to use a remote HADOOP_CONF directory that contains the Hadoop configuration files, this can be |
| achieved by setting <code class="language-plaintext highlighter-rouge">spark.kubernetes.hadoop.configMapName</code> to a pre-existing ConfigMap.</p> |
| |
| <ol> |
| <li>Submitting with <code class="language-plaintext highlighter-rouge">kinit</code>, which stores a TGT in the local ticket cache: |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>/usr/bin/kinit <span class="nt">-kt</span> <keytab_file> <username>/<krb5 realm> |
| /opt/spark/bin/spark-submit <span class="se">\</span> |
| <span class="nt">--deploy-mode</span> cluster <span class="se">\</span> |
| <span class="nt">--class</span> org.apache.spark.examples.HdfsTest <span class="se">\</span> |
| <span class="nt">--master</span> k8s://<KUBERNETES_MASTER_ENDPOINT> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.executor.instances<span class="o">=</span>1 <span class="se">\</span> |
| <span class="nt">--conf</span> spark.app.name<span class="o">=</span>spark-hdfs <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.container.image<span class="o">=</span>spark:latest <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.krb5.path<span class="o">=</span>/etc/krb5.conf <span class="se">\</span> |
| <span class="nb">local</span>:///opt/spark/examples/jars/spark-examples_<VERSION>.jar <span class="se">\</span> |
| <HDFS_FILE_LOCATION> |
| </code></pre></div> </div> |
| </li> |
| <li>Submitting with a local keytab and principal: |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>/opt/spark/bin/spark-submit <span class="se">\</span> |
| <span class="nt">--deploy-mode</span> cluster <span class="se">\</span> |
| <span class="nt">--class</span> org.apache.spark.examples.HdfsTest <span class="se">\</span> |
| <span class="nt">--master</span> k8s://<KUBERNETES_MASTER_ENDPOINT> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.executor.instances<span class="o">=</span>1 <span class="se">\</span> |
| <span class="nt">--conf</span> spark.app.name<span class="o">=</span>spark-hdfs <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.container.image<span class="o">=</span>spark:latest <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kerberos.keytab<span class="o">=</span><KEYTAB_FILE> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kerberos.principal<span class="o">=</span><PRINCIPAL> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.krb5.path<span class="o">=</span>/etc/krb5.conf <span class="se">\</span> |
| <span class="nb">local</span>:///opt/spark/examples/jars/spark-examples_<VERSION>.jar <span class="se">\</span> |
| <HDFS_FILE_LOCATION> |
| </code></pre></div> </div> |
| </li> |
| <li>Submitting with pre-populated Secrets that already exist within the namespace and contain the delegation token: |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>/opt/spark/bin/spark-submit <span class="se">\</span> |
| <span class="nt">--deploy-mode</span> cluster <span class="se">\</span> |
| <span class="nt">--class</span> org.apache.spark.examples.HdfsTest <span class="se">\</span> |
| <span class="nt">--master</span> k8s://<KUBERNETES_MASTER_ENDPOINT> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.executor.instances<span class="o">=</span>1 <span class="se">\</span> |
| <span class="nt">--conf</span> spark.app.name<span class="o">=</span>spark-hdfs <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.container.image<span class="o">=</span>spark:latest <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.tokenSecret.name<span class="o">=</span><SECRET_TOKEN_NAME> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.tokenSecret.itemKey<span class="o">=</span><SECRET_ITEM_KEY> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.krb5.path<span class="o">=</span>/etc/krb5.conf <span class="se">\</span> |
| <span class="nb">local</span>:///opt/spark/examples/jars/spark-examples_<VERSION>.jar <span class="se">\</span> |
| <HDFS_FILE_LOCATION> |
| </code></pre></div> </div> |
| </li> |
| </ol> |
| |
| <p>3b. Submitting as in (3), but specifying a pre-created krb5 ConfigMap and a pre-created <code class="language-plaintext highlighter-rouge">HADOOP_CONF_DIR</code> ConfigMap:</p> |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>/opt/spark/bin/spark-submit <span class="se">\</span> |
| <span class="nt">--deploy-mode</span> cluster <span class="se">\</span> |
| <span class="nt">--class</span> org.apache.spark.examples.HdfsTest <span class="se">\</span> |
| <span class="nt">--master</span> k8s://<KUBERNETES_MASTER_ENDPOINT> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.executor.instances<span class="o">=</span>1 <span class="se">\</span> |
| <span class="nt">--conf</span> spark.app.name<span class="o">=</span>spark-hdfs <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.container.image<span class="o">=</span>spark:latest <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.tokenSecret.name<span class="o">=</span><SECRET_TOKEN_NAME> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.tokenSecret.itemKey<span class="o">=</span><SECRET_ITEM_KEY> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.hadoop.configMapName<span class="o">=</span><HCONF_CONFIG_MAP_NAME> <span class="se">\</span> |
| <span class="nt">--conf</span> spark.kubernetes.kerberos.krb5.configMapName<span class="o">=</span><KRB_CONFIG_MAP_NAME> <span class="se">\</span> |
| <span class="nb">local</span>:///opt/spark/examples/jars/spark-examples_<VERSION>.jar <span class="se">\</span> |
| <HDFS_FILE_LOCATION> |
| </code></pre></div></div> |
| <h1 id="event-logging">Event Logging</h1> |
| |
| <p>If your applications are using event logging, the directory where the event logs go |
| (<code class="language-plaintext highlighter-rouge">spark.eventLog.dir</code>) should be manually created with proper permissions. To secure the log files, |
| the directory permissions should be set to <code class="language-plaintext highlighter-rouge">drwxrwxrwxt</code>. The owner and group of the directory |
| should correspond to the super user who is running the Spark History Server.</p> |
| |
| <p>This will allow all users to write to the directory but will prevent unprivileged users from |
| reading, removing or renaming a file unless they own it. The event log files will be created by |
| Spark with permissions such that only the user and group have read and write access.</p> |
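| |
| <p>For example, on HDFS the directory could be prepared as follows (the path <code class="language-plaintext highlighter-rouge">/spark-events</code> is only a placeholder; <code class="language-plaintext highlighter-rouge">drwxrwxrwxt</code> corresponds to mode 1777), run as the history server super user:</p> |
| |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Create the event log directory, world-writable with the sticky bit set |
| hdfs dfs -mkdir -p /spark-events |
| hdfs dfs -chmod 1777 /spark-events |
| # Then point applications at it, e.g. in spark-defaults.conf: |
| #   spark.eventLog.enabled  true |
| #   spark.eventLog.dir      hdfs:///spark-events |
| </code></pre></div></div> |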
| |
| <h1 id="persisting-driver-logs-in-client-mode">Persisting driver logs in client mode</h1> |
| |
| <p>If your applications persist driver logs in client mode by enabling <code class="language-plaintext highlighter-rouge">spark.driver.log.persistToDfs.enabled</code>, |
| the directory where the driver logs go (<code class="language-plaintext highlighter-rouge">spark.driver.log.dfsDir</code>) should be manually created with proper |
| permissions. To secure the log files, the directory permissions should be set to <code class="language-plaintext highlighter-rouge">drwxrwxrwxt</code>. The owner |
| and group of the directory should correspond to the super user who is running the Spark History Server.</p> |
| |
| <p>This will allow all users to write to the directory but will prevent unprivileged users from |
| reading, removing or renaming a file unless they own it. The driver log files will be created by |
| Spark with permissions such that only the user and group have read and write access.</p> |
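| |
| <p>A corresponding sketch (the directory path below is a placeholder) could be:</p> |
| |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code># Prepare the driver log directory, world-writable with the sticky bit set |
| hdfs dfs -mkdir -p /spark-driver-logs |
| hdfs dfs -chmod 1777 /spark-driver-logs |
| # Then enable driver log persistence, e.g. in spark-defaults.conf: |
| #   spark.driver.log.persistToDfs.enabled  true |
| #   spark.driver.log.dfsDir                hdfs:///spark-driver-logs |
| </code></pre></div></div> |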
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="js/vendor/bootstrap.bundle.min.js"></script> |
| |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <script type="text/javascript" src="js/vendor/docsearch.min.js"></script> |
| <script type="text/javascript"> |
| // DocSearch is entirely free and automated. DocSearch is built in two parts: |
| // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link |
| // in your website and extracts content from every page it traverses. It then pushes this |
| // content to an Algolia index. |
| // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index |
| // to your search input and display its results in a dropdown UI. If you want more |
| // details on how DocSearch works, check the DocSearch docs. |
| docsearch({ |
| apiKey: 'd62f962a82bc9abb53471cb7b89da35e', |
| appId: 'RAI69RXRSK', |
| indexName: 'apache_spark', |
| inputSelector: '#docsearch-input', |
| enhancedSearchInput: true, |
| algoliaOptions: { |
| 'facetFilters': ["version:4.1.0-preview1"] |
| }, |
| debug: false // Set debug to true if you want to inspect the dropdown |
| }); |
| |
| </script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |