blob: 753f17dd705f32ed5fa1c5f3505932d31106f237 [file] [log] [blame]
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en-US"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>Open a multi-file dataset of CSV or other delimiter-separated format — open_delim_dataset • Arrow R Package</title><!-- favicons --><link rel="icon" type="image/png" sizes="96x96" href="../favicon-96x96.png"><link rel="icon" type="image/svg+xml" href="../favicon.svg"><link rel="apple-touch-icon" sizes="180x180" href="../apple-touch-icon.png"><link rel="icon" sizes="any" href="../favicon.ico"><link rel="manifest" href="../site.webmanifest"><script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet"><script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><link href="../deps/font-awesome-6.5.2/css/all.min.css" rel="stylesheet"><link href="../deps/font-awesome-6.5.2/css/v4-shims.min.css" rel="stylesheet"><script src="../deps/headroom-0.11.0/headroom.min.js"></script><script src="../deps/headroom-0.11.0/jQuery.headroom.min.js"></script><script src="../deps/bootstrap-toc-1.0.1/bootstrap-toc.min.js"></script><script src="../deps/clipboard.js-2.0.11/clipboard.min.js"></script><script src="../deps/search-1.0.0/autocomplete.jquery.min.js"></script><script src="../deps/search-1.0.0/fuse.min.js"></script><script src="../deps/search-1.0.0/mark.min.js"></script><!-- pkgdown --><script src="../pkgdown.js"></script><link href="../extra.css" rel="stylesheet"><meta property="og:title" content="Open a multi-file dataset of CSV or other delimiter-separated format — open_delim_dataset"><meta name="description" content="A wrapper around open_dataset which explicitly includes parameters mirroring 
read_csv_arrow(),
read_delim_arrow(), and read_tsv_arrow() to allow for easy switching between functions
for opening single files and functions for opening datasets."><meta property="og:description" content="A wrapper around open_dataset which explicitly includes parameters mirroring read_csv_arrow(),
read_delim_arrow(), and read_tsv_arrow() to allow for easy switching between functions
for opening single files and functions for opening datasets."><meta property="og:image" content="https://arrow.apache.org/img/arrow-logo_horizontal_black-txt_white-bg.png"><meta property="og:image:alt" content="Apache Arrow logo, displaying the triple chevron image adjacent to the text"><!-- Matomo --><script>
// Matomo bootstrap: commands are queued on the shared `window._paq` array
// (reused if it already exists) and the tracker script is injected below.
var _paq = window._paq = window._paq || [];
/* Tracker methods such as "setCustomDimension" must be queued before "trackPageView". */
/* Cookie tracking is explicitly disabled to avoid privacy issues. */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  // Base URL that both the tracking endpoint and the tracker script hang off.
  var baseUrl = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
  _paq.push(['setSiteId', '20']);

  // Build an async <script> element for matomo.js and insert it ahead of the
  // first <script> element found in the document.
  var doc = document;
  var trackerScript = doc.createElement('script');
  var firstScript = doc.getElementsByTagName('script')[0];
  trackerScript.async = true;
  trackerScript.src = baseUrl + 'matomo.js';
  firstScript.parentNode.insertBefore(trackerScript, firstScript);
})();
</script><!-- End Matomo Code --><!-- Kapa AI --><script async src="https://widget.kapa.ai/kapa-widget.bundle.js" data-website-id="9db461d5-ac77-4b3f-a5c5-75efa78339d2" data-project-name="Apache Arrow" data-project-color="#000000" data-project-logo="https://arrow.apache.org/img/arrow-logo_chevrons_white-txt_black-bg.png" data-modal-disclaimer="This is a custom LLM with access to all of [Arrow documentation](https://arrow.apache.org/docs/). If you want an R-specific answer, please mention this in your question." data-consent-required="true" data-user-analytics-cookie-enabled="false" data-consent-screen-disclaimer="By clicking &quot;I agree, let's chat&quot;, you consent to the use of the AI assistant in accordance with kapa.ai's [Privacy Policy](https://www.kapa.ai/content/privacy-policy). This service uses reCAPTCHA, which requires your consent to Google's [Privacy Policy](https://policies.google.com/privacy) and [Terms of Service](https://policies.google.com/terms). By proceeding, you explicitly agree to both kapa.ai's and Google's privacy policies."></script><!-- End Kapa AI --></head><body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-black"><div class="container">
<a class="navbar-brand me-2" href="../index.html">Arrow R Package</a>
<span class="version">
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">22.0.0.9000</small>
</span>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto"><li class="nav-item"><a class="nav-link" href="../articles/arrow.html">Get started</a></li>
<li class="active nav-item"><a class="nav-link" href="../reference/index.html">Reference</a></li>
<li class="nav-item dropdown">
<button class="nav-link dropdown-toggle" type="button" id="dropdown-articles" data-bs-toggle="dropdown" aria-expanded="false" aria-haspopup="true">Articles</button>
<ul class="dropdown-menu" aria-labelledby="dropdown-articles"><li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Using the package</h6></li>
<li><a class="dropdown-item" href="../articles/read_write.html">Reading and writing data files</a></li>
<li><a class="dropdown-item" href="../articles/data_wrangling.html">Data analysis with dplyr syntax</a></li>
<li><a class="dropdown-item" href="../articles/dataset.html">Working with multi-file data sets</a></li>
<li><a class="dropdown-item" href="../articles/python.html">Integrating Arrow, Python, and R</a></li>
<li><a class="dropdown-item" href="../articles/fs.html">Using cloud storage (S3, GCS)</a></li>
<li><a class="dropdown-item" href="../articles/flight.html">Connecting to a Flight server</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Arrow concepts</h6></li>
<li><a class="dropdown-item" href="../articles/data_objects.html">Data objects</a></li>
<li><a class="dropdown-item" href="../articles/data_types.html">Data types</a></li>
<li><a class="dropdown-item" href="../articles/metadata.html">Metadata</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Installation</h6></li>
<li><a class="dropdown-item" href="../articles/install.html">Installing on Linux</a></li>
<li><a class="dropdown-item" href="../articles/install_nightly.html">Installing development versions</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="../articles/index.html">More articles...</a></li>
</ul></li>
<li class="nav-item"><a class="nav-link" href="../news/index.html">Changelog</a></li>
</ul><form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search site" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
<ul class="navbar-nav"><li class="nav-item"><a class="external-link nav-link" href="https://github.com/apache/arrow/" aria-label="GitHub"><span class="fa fab fa-github fa-lg"></span></a></li>
</ul></div>
</div>
</nav><div class="container template-reference-topic">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>Open a multi-file dataset of CSV or other delimiter-separated format</h1>
<small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/main/r/R/dataset.R" class="external-link"><code>R/dataset.R</code></a></small>
<div class="d-none name"><code>open_delim_dataset.Rd</code></div>
</div>
<div class="ref-description section level2">
<p>A wrapper around <a href="open_dataset.html">open_dataset</a> which explicitly includes parameters mirroring <code><a href="read_delim_arrow.html">read_csv_arrow()</a></code>,
<code><a href="read_delim_arrow.html">read_delim_arrow()</a></code>, and <code><a href="read_delim_arrow.html">read_tsv_arrow()</a></code> to allow for easy switching between functions
for opening single files and functions for opening datasets.</p>
</div>
<div class="section level2">
<h2 id="ref-usage">Usage<a class="anchor" aria-label="anchor" href="#ref-usage"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span><span class="fu">open_delim_dataset</span><span class="op">(</span></span>
<span> <span class="va">sources</span>,</span>
<span> schema <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> partitioning <span class="op">=</span> <span class="fu"><a href="hive_partition.html">hive_partition</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> hive_style <span class="op">=</span> <span class="cn">NA</span>,</span>
<span> unify_schemas <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> factory_options <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> delim <span class="op">=</span> <span class="st">","</span>,</span>
<span> quote <span class="op">=</span> <span class="st">"\""</span>,</span>
<span> escape_double <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> escape_backslash <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> col_names <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> col_types <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> na <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">""</span>, <span class="st">"NA"</span><span class="op">)</span>,</span>
<span> skip_empty_rows <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> skip <span class="op">=</span> <span class="fl">0L</span>,</span>
<span> convert_options <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> read_options <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> timestamp_parsers <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> quoted_na <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> parse_options <span class="op">=</span> <span class="cn">NULL</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="fu">open_csv_dataset</span><span class="op">(</span></span>
<span> <span class="va">sources</span>,</span>
<span> schema <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> partitioning <span class="op">=</span> <span class="fu"><a href="hive_partition.html">hive_partition</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> hive_style <span class="op">=</span> <span class="cn">NA</span>,</span>
<span> unify_schemas <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> factory_options <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> quote <span class="op">=</span> <span class="st">"\""</span>,</span>
<span> escape_double <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> escape_backslash <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> col_names <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> col_types <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> na <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">""</span>, <span class="st">"NA"</span><span class="op">)</span>,</span>
<span> skip_empty_rows <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> skip <span class="op">=</span> <span class="fl">0L</span>,</span>
<span> convert_options <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> read_options <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> timestamp_parsers <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> quoted_na <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> parse_options <span class="op">=</span> <span class="cn">NULL</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="fu">open_tsv_dataset</span><span class="op">(</span></span>
<span> <span class="va">sources</span>,</span>
<span> schema <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> partitioning <span class="op">=</span> <span class="fu"><a href="hive_partition.html">hive_partition</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> hive_style <span class="op">=</span> <span class="cn">NA</span>,</span>
<span> unify_schemas <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> factory_options <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> quote <span class="op">=</span> <span class="st">"\""</span>,</span>
<span> escape_double <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> escape_backslash <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> col_names <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> col_types <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> na <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">""</span>, <span class="st">"NA"</span><span class="op">)</span>,</span>
<span> skip_empty_rows <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> skip <span class="op">=</span> <span class="fl">0L</span>,</span>
<span> convert_options <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> read_options <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> timestamp_parsers <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> quoted_na <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> parse_options <span class="op">=</span> <span class="cn">NULL</span></span>
<span><span class="op">)</span></span></code></pre></div>
</div>
<div class="section level2">
<h2 id="arguments">Arguments<a class="anchor" aria-label="anchor" href="#arguments"></a></h2>
<dl><dt id="arg-sources">sources<a class="anchor" aria-label="anchor" href="#arg-sources"></a></dt>
<dd><p>One of:</p><ul><li><p>a string path or URI to a directory containing data files</p></li>
<li><p>a <a href="FileSystem.html">FileSystem</a> that references a directory containing data files
(such as what is returned by <code><a href="s3_bucket.html">s3_bucket()</a></code>)</p></li>
<li><p>a string path or URI to a single file</p></li>
<li><p>a character vector of paths or URIs to individual data files</p></li>
<li><p>a list of <code>Dataset</code> objects as created by this function</p></li>
<li><p>a list of <code>DatasetFactory</code> objects as created by <code><a href="dataset_factory.html">dataset_factory()</a></code>.</p></li>
</ul><p>When <code>sources</code> is a vector of file URIs, they must all use the same protocol
and point to files located in the same file system and having the same
format.</p></dd>
<dt id="arg-schema">schema<a class="anchor" aria-label="anchor" href="#arg-schema"></a></dt>
<dd><p><a href="Schema-class.html">Schema</a> for the <code>Dataset</code>. If <code>NULL</code> (the default), the schema
will be inferred from the data sources.</p></dd>
<dt id="arg-partitioning">partitioning<a class="anchor" aria-label="anchor" href="#arg-partitioning"></a></dt>
<dd><p>When <code>sources</code> is a directory path/URI, one of:</p><ul><li><p>a <code>Schema</code>, in which case the file paths relative to <code>sources</code> will be
parsed, and path segments will be matched with the schema fields.</p></li>
<li><p>a character vector that defines the field names corresponding to those
path segments (that is, you're providing the names that would correspond
to a <code>Schema</code> but the types will be autodetected)</p></li>
<li><p>a <code>Partitioning</code> or <code>PartitioningFactory</code>, such as returned
by <code><a href="hive_partition.html">hive_partition()</a></code></p></li>
<li><p><code>NULL</code> for no partitioning</p></li>
</ul><p>The default is to autodetect Hive-style partitions unless
<code>hive_style = FALSE</code>. See the "Partitioning" section for details.
When <code>sources</code> is not a directory path/URI, <code>partitioning</code> is ignored.</p></dd>
<dt id="arg-hive-style">hive_style<a class="anchor" aria-label="anchor" href="#arg-hive-style"></a></dt>
<dd><p>Logical: should <code>partitioning</code> be interpreted as
Hive-style? Default is <code>NA</code>, which means to inspect the file paths for
Hive-style partitioning and behave accordingly.</p></dd>
<dt id="arg-unify-schemas">unify_schemas<a class="anchor" aria-label="anchor" href="#arg-unify-schemas"></a></dt>
<dd><p>logical: should all data fragments (files, <code>Dataset</code>s)
be scanned in order to create a unified schema from them? If <code>FALSE</code>, only
the first fragment will be inspected for its schema. Use this fast path
when you know and trust that all fragments have an identical schema.
The default is <code>FALSE</code> when creating a dataset from a directory path/URI or
vector of file paths/URIs (because there may be many files and scanning may
be slow) but <code>TRUE</code> when <code>sources</code> is a list of <code>Dataset</code>s (because there
should be few <code>Dataset</code>s in the list and their <code>Schema</code>s are already in
memory).</p></dd>
<dt id="arg-factory-options">factory_options<a class="anchor" aria-label="anchor" href="#arg-factory-options"></a></dt>
<dd><p>list of optional FileSystemFactoryOptions:</p><ul><li><p><code>partition_base_dir</code>: string path segment prefix to ignore when
discovering partition information with DirectoryPartitioning. Not
meaningful (ignored with a warning) for HivePartitioning, nor is it
valid when providing a vector of file paths.</p></li>
<li><p><code>exclude_invalid_files</code>: logical: should files that are not valid data
files be excluded? Default is <code>FALSE</code> because checking all files up
front incurs I/O and thus will be slower, especially on remote
filesystems. If false and there are invalid files, there will be an
error at scan time. This is the only FileSystemFactoryOption that is
valid for both when providing a directory path in which to discover
files and when providing a vector of file paths.</p></li>
<li><p><code>selector_ignore_prefixes</code>: character vector of file prefixes to ignore
when discovering files in a directory. If invalid files can be excluded
by a common filename prefix this way, you can avoid the I/O cost of
<code>exclude_invalid_files</code>. Not valid when providing a vector of file paths
(but if you're providing the file list, you can filter invalid files
yourself).</p></li>
</ul></dd>
<dt id="arg-delim">delim<a class="anchor" aria-label="anchor" href="#arg-delim"></a></dt>
<dd><p>Single character used to separate fields within a record.</p></dd>
<dt id="arg-quote">quote<a class="anchor" aria-label="anchor" href="#arg-quote"></a></dt>
<dd><p>Single character used to quote strings.</p></dd>
<dt id="arg-escape-double">escape_double<a class="anchor" aria-label="anchor" href="#arg-escape-double"></a></dt>
<dd><p>Does the file escape quotes by doubling them?
i.e. If this option is <code>TRUE</code>, the value <code>""""</code> represents
a single quote, <code>\"</code>.</p></dd>
<dt id="arg-escape-backslash">escape_backslash<a class="anchor" aria-label="anchor" href="#arg-escape-backslash"></a></dt>
<dd><p>Does the file use backslashes to escape special
characters? This is more general than <code>escape_double</code> as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like <code>\\n</code>.</p></dd>
<dt id="arg-col-names">col_names<a class="anchor" aria-label="anchor" href="#arg-col-names"></a></dt>
<dd><p>If <code>TRUE</code>, the first row of the input will be used as the
column names and will not be included in the data frame. If <code>FALSE</code>, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.</p></dd>
<dt id="arg-col-types">col_types<a class="anchor" aria-label="anchor" href="#arg-col-types"></a></dt>
<dd><p>A compact string representation of the column types,
an Arrow <a href="Schema-class.html">Schema</a>, or <code>NULL</code> (the default) to infer types from the data.</p></dd>
<dt id="arg-na">na<a class="anchor" aria-label="anchor" href="#arg-na"></a></dt>
<dd><p>A character vector of strings to interpret as missing values.</p></dd>
<dt id="arg-skip-empty-rows">skip_empty_rows<a class="anchor" aria-label="anchor" href="#arg-skip-empty-rows"></a></dt>
<dd><p>Should blank rows be ignored altogether? If
<code>TRUE</code>, blank rows will not be represented at all. If <code>FALSE</code>, they will be
filled with missings.</p></dd>
<dt id="arg-skip">skip<a class="anchor" aria-label="anchor" href="#arg-skip"></a></dt>
<dd><p>Number of lines to skip before reading data.</p></dd>
<dt id="arg-convert-options">convert_options<a class="anchor" aria-label="anchor" href="#arg-convert-options"></a></dt>
<dd><p>see <a href="csv_convert_options.html">CSV conversion options</a></p></dd>
<dt id="arg-read-options">read_options<a class="anchor" aria-label="anchor" href="#arg-read-options"></a></dt>
<dd><p>see <a href="csv_read_options.html">CSV reading options</a></p></dd>
<dt id="arg-timestamp-parsers">timestamp_parsers<a class="anchor" aria-label="anchor" href="#arg-timestamp-parsers"></a></dt>
<dd><p>User-defined timestamp parsers. If more than one
parser is specified, the CSV conversion logic will try parsing values
starting from the beginning of this vector. Possible values are:</p><ul><li><p><code>NULL</code>: the default, which uses the ISO-8601 parser</p></li>
<li><p>a character vector of <a href="https://rdrr.io/r/base/strptime.html" class="external-link">strptime</a> parse strings</p></li>
<li><p>a list of <a href="CsvReadOptions.html">TimestampParser</a> objects</p></li>
</ul></dd>
<dt id="arg-quoted-na">quoted_na<a class="anchor" aria-label="anchor" href="#arg-quoted-na"></a></dt>
<dd><p>Should missing values inside quotes be treated as missing
values (the default) or strings. (Note that this is different from the
Arrow C++ default for the corresponding convert option,
<code>strings_can_be_null</code>.)</p></dd>
<dt id="arg-parse-options">parse_options<a class="anchor" aria-label="anchor" href="#arg-parse-options"></a></dt>
<dd><p>see <a href="csv_parse_options.html">CSV parsing options</a>.
If given, this overrides any
parsing options provided in other arguments (e.g. <code>delim</code>, <code>quote</code>, etc.).</p></dd>
</dl></div>
<div class="section level2">
<h2 id="options-currently-supported-by-read-delim-arrow-which-are-not-supported-here">Options currently supported by <code><a href="read_delim_arrow.html">read_delim_arrow()</a></code> which are not supported here<a class="anchor" aria-label="anchor" href="#options-currently-supported-by-read-delim-arrow-which-are-not-supported-here"></a></h2>
<ul><li><p><code>file</code> (instead, please specify files in <code>sources</code>)</p></li>
<li><p><code>col_select</code> (instead, subset columns after dataset creation)</p></li>
<li><p><code>as_data_frame</code> (instead, convert to data frame after dataset creation)</p></li>
<li><p><code>parse_options</code></p></li>
</ul></div>
<div class="section level2">
<h2 id="see-also">See also<a class="anchor" aria-label="anchor" href="#see-also"></a></h2>
<div class="dont-index"><p><code><a href="open_dataset.html">open_dataset()</a></code></p></div>
</div>
<div class="section level2">
<h2 id="ref-examples">Examples<a class="anchor" aria-label="anchor" href="#ref-examples"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="co"># Set up directory for examples</span></span></span>
<span class="r-in"><span><span class="va">tf</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">tf</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"1"</span>, <span class="st">"2"</span>, <span class="st">"NULL"</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">file_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">tf</span>, <span class="st">"file1.txt"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/utils/write.table.html" class="external-link">write.table</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">file_path</span>, sep <span class="op">=</span> <span class="st">","</span>, row.names <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># Use readr-style params identically in both `read_csv_arrow()` and `open_csv_dataset()`</span></span></span>
<span class="r-in"><span><span class="fu"><a href="read_delim_arrow.html">read_csv_arrow</a></span><span class="op">(</span><span class="va">file_path</span>, na <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">""</span>, <span class="st">"NA"</span>, <span class="st">"NULL"</span><span class="op">)</span>, col_names <span class="op">=</span> <span class="st">"y"</span>, skip <span class="op">=</span> <span class="fl">1</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> <span style="color: #949494;"># A tibble: 3 x 1</span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> y</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> <span style="color: #BCBCBC;">1</span> 1</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> <span style="color: #BCBCBC;">2</span> 2</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> <span style="color: #BCBCBC;">3</span> <span style="color: #BB0000;">NA</span></span>
<span class="r-in"><span><span class="fu">open_csv_dataset</span><span class="op">(</span><span class="va">file_path</span>, na <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">""</span>, <span class="st">"NA"</span>, <span class="st">"NULL"</span><span class="op">)</span>, col_names <span class="op">=</span> <span class="st">"y"</span>, skip <span class="op">=</span> <span class="fl">1</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 1 csv file</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 1 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> y: int64</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># Use `col_types` to specify a schema, partial schema, or compact representation</span></span></span>
<span class="r-in"><span><span class="va">tf2</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="write_delim_dataset.html">write_csv_dataset</a></span><span class="op">(</span><span class="va">cars</span>, <span class="va">tf2</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="fu">open_csv_dataset</span><span class="op">(</span><span class="va">tf2</span>, col_types <span class="op">=</span> <span class="fu"><a href="schema.html">schema</a></span><span class="op">(</span>speed <span class="op">=</span> <span class="fu"><a href="data-type.html">int32</a></span><span class="op">(</span><span class="op">)</span>, dist <span class="op">=</span> <span class="fu"><a href="data-type.html">int32</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 1 csv file</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 2 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> speed: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> dist: int32</span>
<span class="r-in"><span><span class="fu">open_csv_dataset</span><span class="op">(</span><span class="va">tf2</span>, col_types <span class="op">=</span> <span class="fu"><a href="schema.html">schema</a></span><span class="op">(</span>speed <span class="op">=</span> <span class="fu"><a href="data-type.html">int32</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 1 csv file</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 2 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> speed: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> dist: int64</span>
<span class="r-in"><span><span class="fu">open_csv_dataset</span><span class="op">(</span><span class="va">tf2</span>, col_types <span class="op">=</span> <span class="st">"ii"</span>, col_names <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"speed"</span>, <span class="st">"dist"</span><span class="op">)</span>, skip <span class="op">=</span> <span class="fl">1</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> FileSystemDataset with 1 csv file</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> 2 columns</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> speed: int32</span>
<span class="r-out co"><span class="r-pr">#&gt;</span> dist: int32</span>
</code></pre></div>
</div>
</main><aside class="col-md-3"><nav id="toc" aria-label="Table of contents"><h2>On this page</h2>
</nav></aside></div>
<footer><div class="pkgdown-footer-left">
<p><a href="https://arrow.apache.org/docs/r/versions.html">Older versions of these docs</a></p>
</div>
<div class="pkgdown-footer-right">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.1.3.</p>
</div>
</footer></div>
</body></html>