<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Generic File Source Options - Spark 3.5.0 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="css/custom.css" rel="stylesheet">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="css/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">3.5.0</span>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.5.0</span></span>-->
</div>
</nav>
<div class="container">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
Getting Started
</a>
</li>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<ul>
<li>
<a href="sql-data-sources-load-save-functions.html">
Generic Load/Save Functions
</a>
</li>
<li>
<a href="sql-data-sources-generic-options.html">
Generic File Source Options
</a>
</li>
<li>
<a href="sql-data-sources-parquet.html">
Parquet Files
</a>
</li>
<li>
<a href="sql-data-sources-orc.html">
ORC Files
</a>
</li>
<li>
<a href="sql-data-sources-json.html">
JSON Files
</a>
</li>
<li>
<a href="sql-data-sources-csv.html">
CSV Files
</a>
</li>
<li>
<a href="sql-data-sources-text.html">
Text Files
</a>
</li>
<li>
<a href="sql-data-sources-hive-tables.html">
Hive Tables
</a>
</li>
<li>
<a href="sql-data-sources-jdbc.html">
JDBC To Other Databases
</a>
</li>
<li>
<a href="sql-data-sources-avro.html">
Avro Files
</a>
</li>
<li>
<a href="sql-data-sources-protobuf.html">
Protobuf data
</a>
</li>
<li>
<a href="sql-data-sources-binaryFile.html">
Whole Binary Files
</a>
</li>
<li>
<a href="sql-data-sources-troubleshooting.html">
Troubleshooting
</a>
</li>
</ul>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-guide.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
<li>
<a href="sql-error-conditions.html">
Error Conditions
</a>
</li>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">Generic File Source Options</h1>
<ul id="markdown-toc">
<li><a href="#ignore-corrupt-files" id="markdown-toc-ignore-corrupt-files">Ignore Corrupt Files</a></li>
<li><a href="#ignore-missing-files" id="markdown-toc-ignore-missing-files">Ignore Missing Files</a></li>
<li><a href="#path-glob-filter" id="markdown-toc-path-glob-filter">Path Glob Filter</a></li>
<li><a href="#recursive-file-lookup" id="markdown-toc-recursive-file-lookup">Recursive File Lookup</a></li>
<li><a href="#modification-time-path-filters" id="markdown-toc-modification-time-path-filters">Modification Time Path Filters</a></li>
</ul>
<p>These generic options/configurations are effective only when using file-based sources: parquet, orc, avro, json, csv, text.</p>
<p>Please note that the hierarchy of directories used in the examples below is:</p>
<figure class="highlight"><pre><code class="language-text" data-lang="text">dir1/
├── dir2/
│ └── file2.parquet (schema: &lt;file: string&gt;, content: "file2.parquet")
└── file1.parquet (schema: &lt;file, string&gt;, content: "file1.parquet")
└── file3.json (schema: &lt;file, string&gt;, content: "{'file':'corrupt.json'}")</code></pre></figure>
<h3 id="ignore-corrupt-files">Ignore Corrupt Files</h3>
<p>Spark allows you to use the configuration <code class="language-plaintext highlighter-rouge">spark.sql.files.ignoreCorruptFiles</code> or the data source option <code class="language-plaintext highlighter-rouge">ignoreCorruptFiles</code> to ignore corrupt files while reading data
from files. When set to true, Spark jobs will continue to run when they encounter corrupted files, and
the contents that have been read will still be returned.</p>
<p>To ignore corrupt files while reading data files, you can use:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="c1"># enable ignore corrupt files via the data source option
# dir1/file3.json is corrupt from parquet's view
</span><span class="n">test_corrupt_df0</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"ignoreCorruptFiles"</span><span class="p">,</span> <span class="s">"true"</span><span class="p">)</span>\
<span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"examples/src/main/resources/dir1/"</span><span class="p">,</span>
<span class="s">"examples/src/main/resources/dir1/dir2/"</span><span class="p">)</span>
<span class="n">test_corrupt_df0</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------------+
# | file|
# +-------------+
# |file1.parquet|
# |file2.parquet|
# +-------------+
</span>
<span class="c1"># enable ignore corrupt files via the configuration
</span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"set spark.sql.files.ignoreCorruptFiles=true"</span><span class="p">)</span>
<span class="c1"># dir1/file3.json is corrupt from parquet's view
</span><span class="n">test_corrupt_df1</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"examples/src/main/resources/dir1/"</span><span class="p">,</span>
<span class="s">"examples/src/main/resources/dir1/dir2/"</span><span class="p">)</span>
<span class="n">test_corrupt_df1</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------------+
# | file|
# +-------------+
# |file1.parquet|
# |file2.parquet|
# +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// enable ignore corrupt files via the data source option</span>
<span class="c1">// dir1/file3.json is corrupt from parquet's view</span>
<span class="k">val</span> <span class="nv">testCorruptDF0</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"ignoreCorruptFiles"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">).</span><span class="py">parquet</span><span class="o">(</span>
<span class="s">"examples/src/main/resources/dir1/"</span><span class="o">,</span>
<span class="s">"examples/src/main/resources/dir1/dir2/"</span><span class="o">)</span>
<span class="nv">testCorruptDF0</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// |file2.parquet|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// enable ignore corrupt files via the configuration</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"set spark.sql.files.ignoreCorruptFiles=true"</span><span class="o">)</span>
<span class="c1">// dir1/file3.json is corrupt from parquet's view</span>
<span class="k">val</span> <span class="nv">testCorruptDF1</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span>
<span class="s">"examples/src/main/resources/dir1/"</span><span class="o">,</span>
<span class="s">"examples/src/main/resources/dir1/dir2/"</span><span class="o">)</span>
<span class="nv">testCorruptDF1</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// |file2.parquet|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// enable ignore corrupt files via the data source option</span>
<span class="c1">// dir1/file3.json is corrupt from parquet's view</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">testCorruptDF0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">option</span><span class="o">(</span><span class="s">"ignoreCorruptFiles"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">).</span><span class="na">parquet</span><span class="o">(</span>
<span class="s">"examples/src/main/resources/dir1/"</span><span class="o">,</span>
<span class="s">"examples/src/main/resources/dir1/dir2/"</span><span class="o">);</span>
<span class="n">testCorruptDF0</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// |file2.parquet|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// enable ignore corrupt files via the configuration</span>
<span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"set spark.sql.files.ignoreCorruptFiles=true"</span><span class="o">);</span>
<span class="c1">// dir1/file3.json is corrupt from parquet's view</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">testCorruptDF1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span>
<span class="s">"examples/src/main/resources/dir1/"</span><span class="o">,</span>
<span class="s">"examples/src/main/resources/dir1/dir2/"</span><span class="o">);</span>
<span class="n">testCorruptDF1</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// |file2.parquet|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="c1"># enable ignore corrupt files via the data source option</span><span class="w">
</span><span class="c1"># dir1/file3.json is corrupt from parquet's view</span><span class="w">
</span><span class="n">testCorruptDF0</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.parquet</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span><span class="s2">"examples/src/main/resources/dir1/"</span><span class="p">,</span><span class="w"> </span><span class="s2">"examples/src/main/resources/dir1/dir2/"</span><span class="p">),</span><span class="w"> </span><span class="n">ignoreCorruptFiles</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"true"</span><span class="p">)</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">testCorruptDF0</span><span class="p">)</span><span class="w">
</span><span class="c1"># file</span><span class="w">
</span><span class="c1"># 1 file1.parquet</span><span class="w">
</span><span class="c1"># 2 file2.parquet</span><span class="w">
</span><span class="c1"># enable ignore corrupt files via the configuration</span><span class="w">
</span><span class="n">sql</span><span class="p">(</span><span class="s2">"set spark.sql.files.ignoreCorruptFiles=true"</span><span class="p">)</span><span class="w">
</span><span class="c1"># dir1/file3.json is corrupt from parquet's view</span><span class="w">
</span><span class="n">testCorruptDF1</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.parquet</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span><span class="s2">"examples/src/main/resources/dir1/"</span><span class="p">,</span><span class="w"> </span><span class="s2">"examples/src/main/resources/dir1/dir2/"</span><span class="p">))</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">testCorruptDF1</span><span class="p">)</span><span class="w">
</span><span class="c1"># file</span><span class="w">
</span><span class="c1"># 1 file1.parquet</span><span class="w">
</span><span class="c1"># 2 file2.parquet</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<h3 id="ignore-missing-files">Ignore Missing Files</h3>
<p>Spark allows you to use the configuration <code class="language-plaintext highlighter-rouge">spark.sql.files.ignoreMissingFiles</code> or the data source option <code class="language-plaintext highlighter-rouge">ignoreMissingFiles</code> to ignore missing files while reading data
from files. Here, a missing file means a file that was deleted from the directory after the
<code class="language-plaintext highlighter-rouge">DataFrame</code> was constructed. When set to true, Spark jobs will continue to run when they
encounter missing files, and the contents that have been read will still be returned.</p>
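<p>For illustration, a minimal Python sketch (not part of the bundled Spark examples; it assumes an active <code class="language-plaintext highlighter-rouge">SparkSession</code> named <code class="language-plaintext highlighter-rouge">spark</code> and the directory layout above), mirroring the corrupt-file examples:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># enable ignore missing files via the data source option
df0 = spark.read.option("ignoreMissingFiles", "true") \
    .parquet("examples/src/main/resources/dir1/")

# or equivalently via the configuration
spark.sql("set spark.sql.files.ignoreMissingFiles=true")
df1 = spark.read.parquet("examples/src/main/resources/dir1/")

# if a file is deleted between constructing df1 and running an action,
# the action still returns the contents that could be read
df1.show()</code></pre></figure>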
<h3 id="path-glob-filter">Path Glob Filter</h3>
<p><code class="language-plaintext highlighter-rouge">pathGlobFilter</code> is used to only include files with file names matching the pattern. The syntax follows
<code>org.apache.hadoop.fs.GlobFilter</code>. It does not change the behavior of partition discovery.</p>
<p>To load files with paths matching a given glob pattern while keeping the behavior of partition discovery,
you can use:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="s">"parquet"</span><span class="p">,</span> <span class="n">pathGlobFilter</span><span class="o">=</span><span class="s">"*.parquet"</span><span class="p">)</span>
<span class="n">df</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------------+
# | file|
# +-------------+
# |file1.parquet|
# +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">testGlobFilterDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"pathGlobFilter"</span><span class="o">,</span> <span class="s">"*.parquet"</span><span class="o">)</span> <span class="c1">// json file should be filtered out</span>
<span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="o">)</span>
<span class="nv">testGlobFilterDF</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">testGlobFilterDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"pathGlobFilter"</span><span class="o">,</span> <span class="s">"*.parquet"</span><span class="o">)</span> <span class="c1">// json file should be filtered out</span>
<span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="o">);</span>
<span class="n">testGlobFilterDF</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/dir1"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">,</span><span class="w"> </span><span class="n">pathGlobFilter</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"*.parquet"</span><span class="p">)</span><span class="w">
</span><span class="c1"># file</span><span class="w">
</span><span class="c1"># 1 file1.parquet</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
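<p>Beyond the <code class="language-plaintext highlighter-rouge">*</code> wildcard, the Hadoop glob syntax also supports <code class="language-plaintext highlighter-rouge">?</code>, character classes such as <code class="language-plaintext highlighter-rouge">[a-b]</code>, and alternation with <code class="language-plaintext highlighter-rouge">{ab,cd}</code>. A short Python sketch (the pattern is illustrative, not from the bundled examples):</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># a character-class glob: only file names matching file[1-2].parquet
# pass the filter, so file3.json is excluded
df = spark.read.load("examples/src/main/resources/dir1",
                     format="parquet", pathGlobFilter="file[1-2].parquet")</code></pre></figure>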
<h3 id="recursive-file-lookup">Recursive File Lookup</h3>
<p><code class="language-plaintext highlighter-rouge">recursiveFileLookup</code> is used to recursively load files and it disables partition inferring. Its default value is <code class="language-plaintext highlighter-rouge">false</code>.
If data source explicitly specifies the <code class="language-plaintext highlighter-rouge">partitionSpec</code> when <code class="language-plaintext highlighter-rouge">recursiveFileLookup</code> is true, exception will be thrown.</p>
<p>To load all files recursively, you can use:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">recursive_loaded_df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="nb">format</span><span class="p">(</span><span class="s">"parquet"</span><span class="p">)</span>\
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"recursiveFileLookup"</span><span class="p">,</span> <span class="s">"true"</span><span class="p">)</span>\
<span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="p">)</span>
<span class="n">recursive_loaded_df</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------------+
# | file|
# +-------------+
# |file1.parquet|
# |file2.parquet|
# +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">recursiveLoadedDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"recursiveFileLookup"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="o">)</span>
<span class="nv">recursiveLoadedDF</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// |file2.parquet|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">recursiveLoadedDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"recursiveFileLookup"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="o">);</span>
<span class="n">recursiveLoadedDF</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// |file2.parquet|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">recursiveLoadedDF</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/dir1"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">,</span><span class="w"> </span><span class="n">recursiveFileLookup</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"true"</span><span class="p">)</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">recursiveLoadedDF</span><span class="p">)</span><span class="w">
</span><span class="c1"># file</span><span class="w">
</span><span class="c1"># 1 file1.parquet</span><span class="w">
</span><span class="c1"># 2 file2.parquet</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
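<p>The option composes with <code class="language-plaintext highlighter-rouge">pathGlobFilter</code>; a hypothetical Python sketch (not from the bundled examples) that walks the whole tree but keeps only parquet files:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># recursive lookup combined with a glob filter: load every file under
# dir1 (including dir2/) whose name ends in .parquet
df = spark.read.format("parquet") \
    .option("recursiveFileLookup", "true") \
    .option("pathGlobFilter", "*.parquet") \
    .load("examples/src/main/resources/dir1")</code></pre></figure>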
<h3 id="modification-time-path-filters">Modification Time Path Filters</h3>
<p><code class="language-plaintext highlighter-rouge">modifiedBefore</code> and <code class="language-plaintext highlighter-rouge">modifiedAfter</code> are options that can be
applied together or separately in order to achieve greater
granularity over which files may load during a Spark batch query.
(Note that Structured Streaming file sources don&#8217;t support these options.)</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">modifiedBefore</code>: an optional timestamp to only include files with
modification times occurring before the specified time. The provided timestamp
must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)</li>
<li><code class="language-plaintext highlighter-rouge">modifiedAfter</code>: an optional timestamp to only include files with
modification times occurring after the specified time. The provided timestamp
must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)</li>
</ul>
<p>When a timezone option is not provided, the timestamps will be interpreted according
to the Spark session timezone (<code class="language-plaintext highlighter-rouge">spark.sql.session.timeZone</code>).</p>
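<p>For instance, a short Python sketch (not from the bundled examples; the timezone value is illustrative) that pins the session timezone so the filter timestamp is interpreted in UTC:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># interpret the modifiedBefore timestamp below in UTC rather than
# whatever the JVM default timezone happens to be
spark.conf.set("spark.sql.session.timeZone", "UTC")
df = spark.read.load("examples/src/main/resources/dir1",
                     format="parquet", modifiedBefore="2050-07-01T08:30:00")</code></pre></figure>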
<p>To load files with paths matching a given modified time range, you can use:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="c1"># Only load files modified before 07/1/2050 @ 08:30:00
</span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="s">"parquet"</span><span class="p">,</span> <span class="n">modifiedBefore</span><span class="o">=</span><span class="s">"2050-07-01T08:30:00"</span><span class="p">)</span>
<span class="n">df</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------------+
# | file|
# +-------------+
# |file1.parquet|
# +-------------+
# Only load files modified after 06/01/2050 @ 08:30:00
</span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="s">"parquet"</span><span class="p">,</span> <span class="n">modifiedAfter</span><span class="o">=</span><span class="s">"2050-06-01T08:30:00"</span><span class="p">)</span>
<span class="n">df</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------------+
# | file|
# +-------------+
# +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">beforeFilterDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="c1">// Files modified before 07/01/2020 at 05:30 are allowed</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"modifiedBefore"</span><span class="o">,</span> <span class="s">"2020-07-01T05:30:00"</span><span class="o">)</span>
<span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="o">);</span>
<span class="nv">beforeFilterDF</span><span class="o">.</span><span class="py">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// +-------------+</span>
<span class="k">val</span> <span class="nv">afterFilterDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="c1">// Files modified after 06/01/2020 at 05:30 are allowed</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"modifiedAfter"</span><span class="o">,</span> <span class="s">"2020-06-01T05:30:00"</span><span class="o">)</span>
<span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="o">);</span>
<span class="nv">afterFilterDF</span><span class="o">.</span><span class="py">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">beforeFilterDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="c1">// Only load files modified before 7/1/2020 at 05:30</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"modifiedBefore"</span><span class="o">,</span> <span class="s">"2020-07-01T05:30:00"</span><span class="o">)</span>
<span class="c1">// Only load files modified after 6/1/2020 at 05:30</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"modifiedAfter"</span><span class="o">,</span> <span class="s">"2020-06-01T05:30:00"</span><span class="o">)</span>
<span class="c1">// Interpret both times above relative to CST timezone</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"timeZone"</span><span class="o">,</span> <span class="s">"CST"</span><span class="o">)</span>
<span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/dir1"</span><span class="o">);</span>
<span class="n">beforeFilterDF</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | file|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |file1.parquet|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">beforeDF</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/dir1"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">,</span><span class="w"> </span><span class="n">modifiedBefore</span><span class="o">=</span><span class="w"> </span><span class="s2">"2020-07-01T05:30:00"</span><span class="p">)</span><span class="w">
</span><span class="c1"># file</span><span class="w">
</span><span class="c1"># 1 file1.parquet</span><span class="w">
</span><span class="n">afterDF</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/dir1"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">,</span><span class="w"> </span><span class="n">modifiedAfter</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"2020-06-01T05:30:00"</span><span class="p">)</span><span class="w">
</span><span class="c1"># file</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="js/vendor/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extracts content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want more
// details on how DocSearch works, check the DocSearch docs.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:3.5.0"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>