blob: 2ea531675072584fe832ef8cda5acd4f69ba777c [file] [log] [blame]
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en-US"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>Open a multi-file dataset — open_dataset • Arrow R Package</title><!-- favicons --><link rel="icon" type="image/png" sizes="96x96" href="../favicon-96x96.png"><link rel="icon" type="image/svg+xml" href="../favicon.svg"><link rel="apple-touch-icon" sizes="180x180" href="../apple-touch-icon.png"><link rel="icon" sizes="any" href="../favicon.ico"><link rel="manifest" href="../site.webmanifest"><script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet"><script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><link href="../deps/font-awesome-6.5.2/css/all.min.css" rel="stylesheet"><link href="../deps/font-awesome-6.5.2/css/v4-shims.min.css" rel="stylesheet"><script src="../deps/headroom-0.11.0/headroom.min.js"></script><script src="../deps/headroom-0.11.0/jQuery.headroom.min.js"></script><script src="../deps/bootstrap-toc-1.0.1/bootstrap-toc.min.js"></script><script src="../deps/clipboard.js-2.0.11/clipboard.min.js"></script><script src="../deps/search-1.0.0/autocomplete.jquery.min.js"></script><script src="../deps/search-1.0.0/fuse.min.js"></script><script src="../deps/search-1.0.0/mark.min.js"></script><!-- pkgdown --><script src="../pkgdown.js"></script><link href="../extra.css" rel="stylesheet"><meta property="og:title" content="Open a multi-file dataset — open_dataset"><meta name="description" content="Arrow Datasets allow you to query against data that has been split across
multiple files. This sharding of data may indicate partitioning, which
can accelerate queries that only touch some partitions (files). Call
open_dataset() to point to a directory of data files and return a
Dataset, then use dplyr methods to query it."><meta property="og:description" content="Arrow Datasets allow you to query against data that has been split across
multiple files. This sharding of data may indicate partitioning, which
can accelerate queries that only touch some partitions (files). Call
open_dataset() to point to a directory of data files and return a
Dataset, then use dplyr methods to query it."><meta property="og:image" content="https://arrow.apache.org/img/arrow-logo_horizontal_black-txt_white-bg.png"><meta property="og:image:alt" content="Apache Arrow logo, displaying the triple chevron image adjacent to the text"><!-- Matomo --><script>
/* Matomo analytics bootstrap: queue tracker commands on the shared window._paq
   array, then inject the matomo.js loader script asynchronously. */
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* Cookie tracking is explicitly disabled to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  /* Base URL of the analytics endpoint; both the tracker and the loader live under it. */
  var trackerBase = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', trackerBase + 'matomo.php']);
  _paq.push(['setSiteId', '20']);

  /* Create the loader element and insert it ahead of the first <script> in the document. */
  var doc = document;
  var loaderTag = doc.createElement('script');
  var firstScript = doc.getElementsByTagName('script')[0];
  loaderTag.async = true;
  loaderTag.src = trackerBase + 'matomo.js';
  firstScript.parentNode.insertBefore(loaderTag, firstScript);
})();
</script><!-- End Matomo Code --><!-- Kapa AI --><script async src="https://widget.kapa.ai/kapa-widget.bundle.js" data-website-id="9db461d5-ac77-4b3f-a5c5-75efa78339d2" data-project-name="Apache Arrow" data-project-color="#000000" data-project-logo="https://arrow.apache.org/img/arrow-logo_chevrons_white-txt_black-bg.png" data-modal-disclaimer="This is a custom LLM with access to all of [Arrow documentation](https://arrow.apache.org/docs/). If you want an R-specific answer, please mention this in your question." data-consent-required="true" data-user-analytics-cookie-enabled="false" data-consent-screen-disclaimer="By clicking &quot;I agree, let's chat&quot;, you consent to the use of the AI assistant in accordance with kapa.ai's [Privacy Policy](https://www.kapa.ai/content/privacy-policy). This service uses reCAPTCHA, which requires your consent to Google's [Privacy Policy](https://policies.google.com/privacy) and [Terms of Service](https://policies.google.com/terms). By proceeding, you explicitly agree to both kapa.ai's and Google's privacy policies."></script><!-- End Kapa AI --></head><body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-black"><div class="container">
<a class="navbar-brand me-2" href="../index.html">Arrow R Package</a>
<span class="version">
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">22.0.0</small>
</span>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto"><li class="nav-item"><a class="nav-link" href="../articles/arrow.html">Get started</a></li>
<li class="active nav-item"><a class="nav-link" href="../reference/index.html">Reference</a></li>
<li class="nav-item dropdown">
<button class="nav-link dropdown-toggle" type="button" id="dropdown-articles" data-bs-toggle="dropdown" aria-expanded="false" aria-haspopup="true">Articles</button>
<ul class="dropdown-menu" aria-labelledby="dropdown-articles"><li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Using the package</h6></li>
<li><a class="dropdown-item" href="../articles/read_write.html">Reading and writing data files</a></li>
<li><a class="dropdown-item" href="../articles/data_wrangling.html">Data analysis with dplyr syntax</a></li>
<li><a class="dropdown-item" href="../articles/dataset.html">Working with multi-file data sets</a></li>
<li><a class="dropdown-item" href="../articles/python.html">Integrating Arrow, Python, and R</a></li>
<li><a class="dropdown-item" href="../articles/fs.html">Using cloud storage (S3, GCS)</a></li>
<li><a class="dropdown-item" href="../articles/flight.html">Connecting to a Flight server</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Arrow concepts</h6></li>
<li><a class="dropdown-item" href="../articles/data_objects.html">Data objects</a></li>
<li><a class="dropdown-item" href="../articles/data_types.html">Data types</a></li>
<li><a class="dropdown-item" href="../articles/metadata.html">Metadata</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Installation</h6></li>
<li><a class="dropdown-item" href="../articles/install.html">Installing on Linux</a></li>
<li><a class="dropdown-item" href="../articles/install_nightly.html">Installing development versions</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="../articles/index.html">More articles...</a></li>
</ul></li>
<li class="nav-item"><a class="nav-link" href="../news/index.html">Changelog</a></li>
</ul><form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search" name="search-input" data-search-index="../search.json" id="search-input" placeholder="" autocomplete="off"></form>
<ul class="navbar-nav"><li class="nav-item"><a class="external-link nav-link" href="https://github.com/apache/arrow/" aria-label="GitHub"><span class="fa fab fa-github fa-lg"></span></a></li>
</ul></div>
</div>
</nav><div class="container template-reference-topic">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>Open a multi-file dataset</h1>
<small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/main/r/R/dataset.R" class="external-link"><code>R/dataset.R</code></a></small>
<div class="d-none name"><code>open_dataset.Rd</code></div>
</div>
<div class="ref-description section level2">
<p>Arrow Datasets allow you to query against data that has been split across
multiple files. This sharding of data may indicate partitioning, which
can accelerate queries that only touch some partitions (files). Call
<code>open_dataset()</code> to point to a directory of data files and return a
<code>Dataset</code>, then use <code>dplyr</code> methods to query it.</p>
</div>
<div class="section level2">
<h2 id="ref-usage">Usage<a class="anchor" aria-label="anchor" href="#ref-usage"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span><span class="fu">open_dataset</span><span class="op">(</span></span>
<span> <span class="va">sources</span>,</span>
<span> schema <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> partitioning <span class="op">=</span> <span class="fu"><a href="hive_partition.html">hive_partition</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> hive_style <span class="op">=</span> <span class="cn">NA</span>,</span>
<span> unify_schemas <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> format <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"parquet"</span>, <span class="st">"arrow"</span>, <span class="st">"ipc"</span>, <span class="st">"feather"</span>, <span class="st">"csv"</span>, <span class="st">"tsv"</span>, <span class="st">"text"</span>, <span class="st">"json"</span><span class="op">)</span>,</span>
<span> factory_options <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> <span class="va">...</span></span>
<span><span class="op">)</span></span></code></pre></div>
</div>
<div class="section level2">
<h2 id="arguments">Arguments<a class="anchor" aria-label="anchor" href="#arguments"></a></h2>
<dl><dt id="arg-sources">sources<a class="anchor" aria-label="anchor" href="#arg-sources"></a></dt>
<dd><p>One of:</p><ul><li><p>a string path or URI to a directory containing data files</p></li>
<li><p>a <a href="FileSystem.html">FileSystem</a> that references a directory containing data files
(such as what is returned by <code><a href="s3_bucket.html">s3_bucket()</a></code>)</p></li>
<li><p>a string path or URI to a single file</p></li>
<li><p>a character vector of paths or URIs to individual data files</p></li>
<li><p>a list of <code>Dataset</code> objects as created by this function</p></li>
<li><p>a list of <code>DatasetFactory</code> objects as created by <code><a href="dataset_factory.html">dataset_factory()</a></code>.</p></li>
</ul><p>When <code>sources</code> is a vector of file URIs, they must all use the same protocol
and point to files located in the same file system and having the same
format.</p></dd>
<dt id="arg-schema">schema<a class="anchor" aria-label="anchor" href="#arg-schema"></a></dt>
<dd><p><a href="Schema-class.html">Schema</a> for the <code>Dataset</code>. If <code>NULL</code> (the default), the schema
will be inferred from the data sources.</p></dd>
<dt id="arg-partitioning">partitioning<a class="anchor" aria-label="anchor" href="#arg-partitioning"></a></dt>
<dd><p>When <code>sources</code> is a directory path/URI, one of:</p><ul><li><p>a <code>Schema</code>, in which case the file paths relative to <code>sources</code> will be
parsed, and path segments will be matched with the schema fields.</p></li>
<li><p>a character vector that defines the field names corresponding to those
path segments (that is, you're providing the names that would correspond
to a <code>Schema</code> but the types will be autodetected)</p></li>
<li><p>a <code>Partitioning</code> or <code>PartitioningFactory</code>, such as returned
by <code><a href="hive_partition.html">hive_partition()</a></code></p></li>
<li><p><code>NULL</code> for no partitioning</p></li>
</ul><p>The default is to autodetect Hive-style partitions unless
<code>hive_style = FALSE</code>. See the "Partitioning" section for details.
When <code>sources</code> is not a directory path/URI, <code>partitioning</code> is ignored.</p></dd>
<dt id="arg-hive-style">hive_style<a class="anchor" aria-label="anchor" href="#arg-hive-style"></a></dt>
<dd><p>Logical: should <code>partitioning</code> be interpreted as
Hive-style? Default is <code>NA</code>, which means to inspect the file paths for
Hive-style partitioning and behave accordingly.</p></dd>
<dt id="arg-unify-schemas">unify_schemas<a class="anchor" aria-label="anchor" href="#arg-unify-schemas"></a></dt>
<dd><p>logical: should all data fragments (files, <code>Dataset</code>s)
be scanned in order to create a unified schema from them? If <code>FALSE</code>, only
the first fragment will be inspected for its schema. Use this fast path
when you know and trust that all fragments have an identical schema.
The default is <code>FALSE</code> when creating a dataset from a directory path/URI or
vector of file paths/URIs (because there may be many files and scanning may
be slow) but <code>TRUE</code> when <code>sources</code> is a list of <code>Dataset</code>s (because there
should be few <code>Dataset</code>s in the list and their <code>Schema</code>s are already in
memory).</p></dd>
<dt id="arg-format">format<a class="anchor" aria-label="anchor" href="#arg-format"></a></dt>
<dd><p>A <a href="FileFormat.html">FileFormat</a> object, or a string identifier of the format of
the files in <code>sources</code>. This argument is ignored when <code>sources</code> is a list of <code>Dataset</code> objects.
Currently supported values:</p><ul><li><p>"parquet"</p></li>
<li><p>"ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
only version 2 files are supported</p></li>
<li><p>"csv"/"text", aliases for the same thing (because comma is the default
delimiter for text files)</p></li>
<li><p>"tsv", equivalent to passing <code>format = "text", delimiter = "\t"</code></p></li>
<li><p>"json", for JSON format datasets. Note: only newline-delimited JSON (aka ND-JSON) datasets
are currently supported.</p></li>
</ul><p>Default is "parquet", unless a <code>delimiter</code> is also specified, in which case
it is assumed to be "text".</p></dd>
<dt id="arg-factory-options">factory_options<a class="anchor" aria-label="anchor" href="#arg-factory-options"></a></dt>
<dd><p>list of optional FileSystemFactoryOptions:</p><ul><li><p><code>partition_base_dir</code>: string path segment prefix to ignore when
discovering partition information with DirectoryPartitioning. Not
meaningful (ignored with a warning) for HivePartitioning, nor is it
valid when providing a vector of file paths.</p></li>
<li><p><code>exclude_invalid_files</code>: logical: should files that are not valid data
files be excluded? Default is <code>FALSE</code> because checking all files up
front incurs I/O and thus will be slower, especially on remote
filesystems. If false and there are invalid files, there will be an
error at scan time. This is the only FileSystemFactoryOption that is
valid both when providing a directory path in which to discover
files and when providing a vector of file paths.</p></li>
<li><p><code>selector_ignore_prefixes</code>: character vector of file prefixes to ignore
when discovering files in a directory. If invalid files can be excluded
by a common filename prefix this way, you can avoid the I/O cost of
<code>exclude_invalid_files</code>. Not valid when providing a vector of file paths
(but if you're providing the file list, you can filter invalid files
yourself).</p></li>
</ul></dd>
<dt id="arg--">...<a class="anchor" aria-label="anchor" href="#arg--"></a></dt>
<dd><p>additional arguments passed to <code><a href="dataset_factory.html">dataset_factory()</a></code> when <code>sources</code>
is a directory path/URI or vector of file paths/URIs, otherwise ignored.
These may include <code>format</code> to indicate the file format, or other
format-specific options (see <code><a href="read_delim_arrow.html">read_csv_arrow()</a></code>, <code><a href="read_parquet.html">read_parquet()</a></code> and <code><a href="read_feather.html">read_feather()</a></code> on how to specify these).</p></dd>
</dl></div>
<div class="section level2">
<h2 id="value">Value<a class="anchor" aria-label="anchor" href="#value"></a></h2>
<p>A <a href="Dataset.html">Dataset</a> R6 object. Use <code>dplyr</code> methods on it to query the data,
or call <code><a href="Scanner.html">$NewScan()</a></code> to construct a query directly.</p>
</div>
<div class="section level2">
<h2 id="partitioning">Partitioning<a class="anchor" aria-label="anchor" href="#partitioning"></a></h2>
<p>Data is often split into multiple files and nested in subdirectories based on the value of one or more
columns in the data. It may be a column that is commonly referenced in
queries, or it may be time-based, for example. Data that is divided
this way is "partitioned," and the values for those partitioning columns are
encoded into the file path segments.
These path segments are effectively virtual columns in the dataset, and
because their values are known prior to reading the files themselves, we can
greatly speed up filtered queries by skipping some files entirely.</p>
<p>Arrow supports reading partition information from file paths in two forms:</p><ul><li><p>"Hive-style", deriving from the Apache Hive project and common to some
database systems. Partitions are encoded as "key=value" in path segments,
such as <code>"year=2019/month=1/file.parquet"</code>. While they may be awkward as
file names, they have the advantage of being self-describing.</p></li>
<li><p>"Directory" partitioning, which is Hive without the key names, like
<code>"2019/01/file.parquet"</code>. In order to use these, we need to know at least
what names to give the virtual columns that come from the path segments.</p></li>
</ul><p>The default behavior in <code>open_dataset()</code> is to inspect the file paths
contained in the provided directory, and if they look like Hive-style, parse
them as Hive. If your dataset has Hive-style partitioning in the file paths,
you do not need to provide anything in the <code>partitioning</code> argument to
<code>open_dataset()</code> to use them. If you do provide a character vector of
partition column names, they will be ignored if they match what is detected,
and if they don't match, you'll get an error. (If you want to rename
partition columns, do that using <code><a href="https://dplyr.tidyverse.org/reference/select.html" class="external-link">select()</a></code> or <code><a href="https://dplyr.tidyverse.org/reference/rename.html" class="external-link">rename()</a></code> after opening the
dataset.) If you provide a <code>Schema</code> and the names match what is detected,
it will use the types defined by the Schema. In the example file path above,
you could provide a Schema to specify that "month" should be <code><a href="data-type.html">int8()</a></code>
instead of the <code><a href="data-type.html">int32()</a></code> it will be parsed as by default.</p>
<p>If your file paths do not appear to be Hive-style, or if you pass
<code>hive_style = FALSE</code>, the <code>partitioning</code> argument will be used to create
Directory partitioning. A character vector of names is required to create
partitions; you may instead provide a <code>Schema</code> to map those names to desired
column types, as described above. If neither is provided, no partitioning
information will be taken from the file paths.</p>
</div>
<div class="section level2">
<h2 id="see-also">See also<a class="anchor" aria-label="anchor" href="#see-also"></a></h2>
<div class="dont-index"><p><a href="https://arrow.apache.org/docs/r/articles/dataset.html">
datasets article</a></p></div>
</div>
<div class="section level2">
<h2 id="ref-examples">Examples<a class="anchor" aria-label="anchor" href="#ref-examples"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="co"># Set up directory for examples</span></span></span>
<span class="r-in"><span><span class="va">tf</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">tf</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="fu"><a href="write_dataset.html">write_dataset</a></span><span class="op">(</span><span class="va">mtcars</span>, <span class="va">tf</span>, partitioning <span class="op">=</span> <span class="st">"cyl"</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># You can specify a directory containing the files for your dataset and</span></span></span>
<span class="r-in"><span><span class="co"># open_dataset will scan all files in your directory.</span></span></span>
<span class="r-in"><span><span class="fu">open_dataset</span><span class="op">(</span><span class="va">tf</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 3 Parquet files</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 11 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> mpg: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> disp: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> hp: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> drat: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> wt: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> qsec: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> vs: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> am: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> gear: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> carb: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> cyl: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> </span>
<span class="r-out co"><span class="r-pr">#&gt;</span> See $metadata for additional Schema metadata</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># You can also supply a vector of paths</span></span></span>
<span class="r-in"><span><span class="fu">open_dataset</span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">tf</span>, <span class="st">"cyl=4/part-0.parquet"</span><span class="op">)</span>, <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">tf</span>, <span class="st">"cyl=8/part-0.parquet"</span><span class="op">)</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 2 Parquet files</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 10 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> mpg: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> disp: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> hp: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> drat: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> wt: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> qsec: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> vs: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> am: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> gear: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> carb: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> </span>
<span class="r-out co"><span class="r-pr">#&gt;</span> See $metadata for additional Schema metadata</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co">## You must specify the file format if using a format other than parquet.</span></span></span>
<span class="r-in"><span><span class="va">tf2</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">tf2</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="write_dataset.html">write_dataset</a></span><span class="op">(</span><span class="va">mtcars</span>, <span class="va">tf2</span>, format <span class="op">=</span> <span class="st">"ipc"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># This line will result in errors when you try to work with the data</span></span></span>
<span class="r-in"><span><span class="kw">if</span> <span class="op">(</span><span class="cn">FALSE</span><span class="op">)</span> <span class="op">{</span> <span class="co"># \dontrun{</span></span></span>
<span class="r-in"><span><span class="fu">open_dataset</span><span class="op">(</span><span class="va">tf2</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">}</span> <span class="co"># }</span></span></span>
<span class="r-in"><span><span class="co"># This line will work</span></span></span>
<span class="r-in"><span><span class="fu">open_dataset</span><span class="op">(</span><span class="va">tf2</span>, format <span class="op">=</span> <span class="st">"ipc"</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 1 Feather file</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 11 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> mpg: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> cyl: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> disp: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> hp: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> drat: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> wt: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> qsec: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> vs: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> am: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> gear: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> carb: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> </span>
<span class="r-out co"><span class="r-pr">#&gt;</span> See $metadata for additional Schema metadata</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co">## You can specify file partitioning to include it as a field in your dataset</span></span></span>
<span class="r-in"><span><span class="co"># Create a temporary directory and write example dataset</span></span></span>
<span class="r-in"><span><span class="va">tf3</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">tf3</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="write_dataset.html">write_dataset</a></span><span class="op">(</span><span class="va">airquality</span>, <span class="va">tf3</span>, partitioning <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"Month"</span>, <span class="st">"Day"</span><span class="op">)</span>, hive_style <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># View files - you can see the partitioning means that files have been written</span></span></span>
<span class="r-in"><span><span class="co"># to folders based on Month/Day values</span></span></span>
<span class="r-in"><span><span class="va">tf3_files</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.files.html" class="external-link">list.files</a></span><span class="op">(</span><span class="va">tf3</span>, recursive <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># With no partitioning specified, dataset contains all files but doesn't include</span></span></span>
<span class="r-in"><span><span class="co"># directory names as field names</span></span></span>
<span class="r-in"><span><span class="fu">open_dataset</span><span class="op">(</span><span class="va">tf3</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 153 Parquet files</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 4 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Ozone: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Solar.R: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Wind: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Temp: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> </span>
<span class="r-out co"><span class="r-pr">#&gt;</span> See $metadata for additional Schema metadata</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># Now that partitioning has been specified, your dataset contains columns for Month and Day</span></span></span>
<span class="r-in"><span><span class="fu">open_dataset</span><span class="op">(</span><span class="va">tf3</span>, partitioning <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"Month"</span>, <span class="st">"Day"</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 153 Parquet files</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 6 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Ozone: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Solar.R: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Wind: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Temp: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Month: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Day: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> </span>
<span class="r-out co"><span class="r-pr">#&gt;</span> See $metadata for additional Schema metadata</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># If you want to specify the data types for your fields, you can pass in a Schema</span></span></span>
<span class="r-in"><span><span class="fu">open_dataset</span><span class="op">(</span><span class="va">tf3</span>, partitioning <span class="op">=</span> <span class="fu"><a href="schema.html">schema</a></span><span class="op">(</span>Month <span class="op">=</span> <span class="fu"><a href="data-type.html">int8</a></span><span class="op">(</span><span class="op">)</span>, Day <span class="op">=</span> <span class="fu"><a href="data-type.html">int8</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 153 Parquet files</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 6 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Ozone: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Solar.R: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Wind: double</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Temp: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Month: int8</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> Day: int8</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> </span>
<span class="r-out co"><span class="r-pr">#&gt;</span> See $metadata for additional Schema metadata</span>
</code></pre></div>
</div>
</main><aside class="col-md-3"><nav id="toc" aria-label="Table of contents"><h2>On this page</h2>
</nav></aside></div>
<footer><div class="pkgdown-footer-left">
<p><a href="https://arrow.apache.org/docs/r/versions.html">Older versions of these docs</a></p>
</div>
<div class="pkgdown-footer-right">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.1.3.</p>
</div>
</footer></div>
</body></html>