blob: 9e0fd76618e3bde6bd747fdb29f731b80d627863 [file] [log] [blame]
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en-US">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>Reading and writing data files • Arrow R Package</title>
<!-- favicons --><link rel="icon" type="image/png" sizes="96x96" href="../favicon-96x96.png">
<link rel="icon" type="image/svg+xml" href="../favicon.svg">
<link rel="apple-touch-icon" sizes="180x180" href="../apple-touch-icon.png">
<link rel="icon" sizes="any" href="../favicon.ico">
<link rel="manifest" href="../site.webmanifest">
<script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet">
<script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><link href="../deps/font-awesome-6.5.2/css/all.min.css" rel="stylesheet">
<link href="../deps/font-awesome-6.5.2/css/v4-shims.min.css" rel="stylesheet">
<script src="../deps/headroom-0.11.0/headroom.min.js"></script><script src="../deps/headroom-0.11.0/jQuery.headroom.min.js"></script><script src="../deps/bootstrap-toc-1.0.1/bootstrap-toc.min.js"></script><script src="../deps/clipboard.js-2.0.11/clipboard.min.js"></script><script src="../deps/search-1.0.0/autocomplete.jquery.min.js"></script><script src="../deps/search-1.0.0/fuse.min.js"></script><script src="../deps/search-1.0.0/mark.min.js"></script><!-- pkgdown --><script src="../pkgdown.js"></script><link href="../extra.css" rel="stylesheet">
<meta property="og:title" content="Reading and writing data files">
<meta name="description" content="Learn how to read and write CSV, Parquet, and Feather files with arrow">
<meta property="og:description" content="Learn how to read and write CSV, Parquet, and Feather files with arrow">
<meta property="og:image" content="https://arrow.apache.org/img/arrow-logo_horizontal_black-txt_white-bg.png">
<meta property="og:image:alt" content="Apache Arrow logo, displaying the triple chevron image adjacent to the text">
<!-- Matomo --><script>
// Matomo analytics bootstrap.
// Reuse an existing window._paq if one is already defined, otherwise start
// with an empty array; the same array is exposed globally and kept in a local.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
// Commands above are only queued here; presumably they are processed by
// matomo.js once it has loaded (NOTE(review): confirm against Matomo docs).
(function() {
// Base URL of the analytics host; both the tracker endpoint and the loader
// script are resolved relative to it.
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
// d: the document, g: a newly created script element, s: the first script
// element currently in the document (used as the insertion anchor).
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
// Mark the new script async, point it at matomo.js, and insert it
// immediately before the first existing script element.
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script><!-- End Matomo Code --><!-- Kapa AI --><script async src="https://widget.kapa.ai/kapa-widget.bundle.js" data-website-id="9db461d5-ac77-4b3f-a5c5-75efa78339d2" data-project-name="Apache Arrow" data-project-color="#000000" data-project-logo="https://arrow.apache.org/img/arrow-logo_chevrons_white-txt_black-bg.png" data-modal-disclaimer="This is a custom LLM with access to all of [Arrow documentation](https://arrow.apache.org/docs/). If you want an R-specific answer, please mention this in your question." data-consent-required="true" data-user-analytics-cookie-enabled="false" data-consent-screen-disclaimer="By clicking &quot;I agree, let's chat&quot;, you consent to the use of the AI assistant in accordance with kapa.ai's [Privacy Policy](https://www.kapa.ai/content/privacy-policy). This service uses reCAPTCHA, which requires your consent to Google's [Privacy Policy](https://policies.google.com/privacy) and [Terms of Service](https://policies.google.com/terms). By proceeding, you explicitly agree to both kapa.ai's and Google's privacy policies."></script><!-- End Kapa AI -->
</head>
<body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-black"><div class="container">
<a class="navbar-brand me-2" href="../index.html">Arrow R Package</a>
<span class="version">
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">21.0.0.9000</small>
</span>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a class="nav-link" href="../articles/arrow.html">Get started</a></li>
<li class="nav-item"><a class="nav-link" href="../reference/index.html">Reference</a></li>
<li class="active nav-item dropdown">
<button class="nav-link dropdown-toggle" type="button" id="dropdown-articles" data-bs-toggle="dropdown" aria-expanded="false" aria-haspopup="true">Articles</button>
<ul class="dropdown-menu" aria-labelledby="dropdown-articles">
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Using the package</h6></li>
<li><a class="dropdown-item" href="../articles/read_write.html">Reading and writing data files</a></li>
<li><a class="dropdown-item" href="../articles/data_wrangling.html">Data analysis with dplyr syntax</a></li>
<li><a class="dropdown-item" href="../articles/dataset.html">Working with multi-file data sets</a></li>
<li><a class="dropdown-item" href="../articles/python.html">Integrating Arrow, Python, and R</a></li>
<li><a class="dropdown-item" href="../articles/fs.html">Using cloud storage (S3, GCS)</a></li>
<li><a class="dropdown-item" href="../articles/flight.html">Connecting to a Flight server</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Arrow concepts</h6></li>
<li><a class="dropdown-item" href="../articles/data_objects.html">Data objects</a></li>
<li><a class="dropdown-item" href="../articles/data_types.html">Data types</a></li>
<li><a class="dropdown-item" href="../articles/metadata.html">Metadata</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Installation</h6></li>
<li><a class="dropdown-item" href="../articles/install.html">Installing on Linux</a></li>
<li><a class="dropdown-item" href="../articles/install_nightly.html">Installing development versions</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="../articles/index.html">More articles...</a></li>
</ul>
</li>
<li class="nav-item"><a class="nav-link" href="../news/index.html">Changelog</a></li>
</ul>
<form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search site" name="search-input" data-search-index="../search.json" id="search-input" placeholder="" autocomplete="off">
</form>
<ul class="navbar-nav">
<li class="nav-item"><a class="external-link nav-link" href="https://github.com/apache/arrow/" aria-label="GitHub"><span class="fa fab fa-github fa-lg"></span></a></li>
</ul>
</div>
</div>
</nav><div class="container template-article">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>Reading and writing data files</h1>
<small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/main/r/vignettes/read_write.Rmd" class="external-link"><code>vignettes/read_write.Rmd</code></a></small>
<div class="d-none name"><code>read_write.Rmd</code></div>
</div>
<p>The arrow package provides functions for reading single data files
into memory, in several common formats. By default, calling any of these
functions returns an R data frame. To return an Arrow Table, set
argument <code>as_data_frame = FALSE</code>.</p>
<ul>
<li>
<code><a href="../reference/read_parquet.html">read_parquet()</a></code>: read a file in Parquet format</li>
<li>
<code><a href="../reference/read_feather.html">read_feather()</a></code>: read a file in the Apache Arrow IPC
format (formerly called the Feather format)</li>
<li>
<code><a href="../reference/read_delim_arrow.html">read_delim_arrow()</a></code>: read a delimited text file (default
delimiter is comma)</li>
<li>
<code><a href="../reference/read_delim_arrow.html">read_csv_arrow()</a></code>: read a comma-separated values (CSV)
file</li>
<li>
<code><a href="../reference/read_delim_arrow.html">read_tsv_arrow()</a></code>: read a tab-separated values (TSV)
file</li>
<li>
<code><a href="../reference/read_json_arrow.html">read_json_arrow()</a></code>: read a JSON data file</li>
</ul>
<p>For writing data to single files, the arrow package provides the
following functions, which can be used with both R data frames and Arrow
Tables:</p>
<ul>
<li>
<code><a href="../reference/write_parquet.html">write_parquet()</a></code>: write a file in Parquet format</li>
<li>
<code><a href="../reference/write_feather.html">write_feather()</a></code>: write a file in Arrow IPC format</li>
<li>
<code><a href="../reference/write_csv_arrow.html">write_csv_arrow()</a></code>: write a file in CSV format</li>
</ul>
<p>All these functions can read and write files in the local filesystem
or to cloud storage. For more on cloud storage support in arrow, see the
<a href="./fs.html">cloud storage article</a>.</p>
<p>The arrow package also supports reading larger-than-memory single
data files, and reading and writing multi-file data sets. This enables
analysis and processing of larger-than-memory data, and provides the
ability to partition data into smaller chunks without loading the full
data into memory. For more information on this topic, see the <a href="./dataset.html">dataset article</a>.</p>
<div class="section level2">
<h2 id="parquet-format">Parquet format<a class="anchor" aria-label="anchor" href="#parquet-format"></a>
</h2>
<p><a href="https://parquet.apache.org/" class="external-link">Apache Parquet</a> is a popular
choice for storing analytics data; it is a binary format that is
optimized for reduced file sizes and fast read performance, especially
for column-based access patterns. The simplest way to read and write
Parquet data using arrow is with the <code><a href="../reference/read_parquet.html">read_parquet()</a></code> and
<code><a href="../reference/write_parquet.html">write_parquet()</a></code> functions. To illustrate this, we’ll write
the <code>starwars</code> data included in dplyr to a Parquet file, then
read it back in. First load the arrow and dplyr packages:</p>
<div class="sourceCode" id="cb1"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="kw"><a href="https://rdrr.io/r/base/library.html" class="external-link">library</a></span><span class="op">(</span><span class="va"><a href="https://github.com/apache/arrow/" class="external-link">arrow</a></span>, warn.conflicts <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
<span><span class="kw"><a href="https://rdrr.io/r/base/library.html" class="external-link">library</a></span><span class="op">(</span><span class="va"><a href="https://dplyr.tidyverse.org" class="external-link">dplyr</a></span>, warn.conflicts <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div>
<p>Next we’ll write the data frame to a Parquet file located at
<code>file_path</code>:</p>
<div class="sourceCode" id="cb2"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">file_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span>
<span><span class="fu"><a href="../reference/write_parquet.html">write_parquet</a></span><span class="op">(</span><span class="va">starwars</span>, <span class="va">file_path</span><span class="op">)</span></span></code></pre></div>
<p>The size of a Parquet file is typically much smaller than the
corresponding CSV file would have been. This is in part due to the use
of file compression: by default, Parquet files written with the arrow
package use <a href="https://google.github.io/snappy/" class="external-link">Snappy
compression</a> but other options such as gzip are also supported. See
<code><a href="../reference/write_parquet.html">help("write_parquet", package = "arrow")</a></code> for more
information.</p>
<p>Having written the Parquet file, we now can read it with
<code><a href="../reference/read_parquet.html">read_parquet()</a></code>:</p>
<div class="sourceCode" id="cb3"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="../reference/read_parquet.html">read_parquet</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 87 x 14</span></span></span>
<span><span class="co">## name height mass hair_color skin_color eye_color birth_year sex gender</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Luke Sk~ 172 77 blond fair blue 19 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> C-3PO 167 75 <span style="color: #BB0000;">NA</span> gold yellow 112 none mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> R2-D2 96 32 <span style="color: #BB0000;">NA</span> white, bl~ red 33 none mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Darth V~ 202 136 none white yellow 41.9 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Leia Or~ 150 49 brown light brown 19 fema~ femin~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Owen La~ 178 120 brown, gr~ light blue 52 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Beru Wh~ 165 75 brown light blue 47 fema~ femin~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> R5-D4 97 32 <span style="color: #BB0000;">NA</span> white, red red <span style="color: #BB0000;">NA</span> none mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Biggs D~ 183 84 black light brown 24 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Obi-Wan~ 182 77 auburn, w~ fair blue-gray 57 male mascu~</span></span>
<span><span class="co">## <span style="color: #949494;"># i 77 more rows</span></span></span>
<span><span class="co">## <span style="color: #949494;"># i 5 more variables: homeworld &lt;chr&gt;, species &lt;chr&gt;, films &lt;list&lt;character&gt;&gt;,</span></span></span>
<span><span class="co">## <span style="color: #949494;"># vehicles &lt;list&lt;character&gt;&gt;, starships &lt;list&lt;character&gt;&gt;</span></span></span></code></pre>
<p>The default is to return a data frame or tibble. If we want an Arrow
Table instead, we would set <code>as_data_frame = FALSE</code>:</p>
<div class="sourceCode" id="cb5"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="../reference/read_parquet.html">read_parquet</a></span><span class="op">(</span><span class="va">file_path</span>, as_data_frame <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## Table</span></span>
<span><span class="co">## 87 rows x 14 columns</span></span>
<span><span class="co">## $name &lt;string&gt;</span></span>
<span><span class="co">## $height &lt;int32&gt;</span></span>
<span><span class="co">## $mass &lt;double&gt;</span></span>
<span><span class="co">## $hair_color &lt;string&gt;</span></span>
<span><span class="co">## $skin_color &lt;string&gt;</span></span>
<span><span class="co">## $eye_color &lt;string&gt;</span></span>
<span><span class="co">## $birth_year &lt;double&gt;</span></span>
<span><span class="co">## $sex &lt;string&gt;</span></span>
<span><span class="co">## $gender &lt;string&gt;</span></span>
<span><span class="co">## $homeworld &lt;string&gt;</span></span>
<span><span class="co">## $species &lt;string&gt;</span></span>
<span><span class="co">## $films: list&lt;element &lt;string&gt;&gt;</span></span>
<span><span class="co">## $vehicles: list&lt;element &lt;string&gt;&gt;</span></span>
<span><span class="co">## $starships: list&lt;element &lt;string&gt;&gt;</span></span></code></pre>
<p>One useful feature of Parquet files is that they store data
column-wise, and contain metadata that allow file readers to skip to the
relevant sections of the file. That means it is possible to load only a
subset of the columns without reading the complete file. The
<code>col_select</code> argument to <code><a href="../reference/read_parquet.html">read_parquet()</a></code> supports
this functionality:</p>
<div class="sourceCode" id="cb7"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="../reference/read_parquet.html">read_parquet</a></span><span class="op">(</span><span class="va">file_path</span>, col_select <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"name"</span>, <span class="st">"height"</span>, <span class="st">"mass"</span><span class="op">)</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 87 x 3</span></span></span>
<span><span class="co">## name height mass</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Luke Skywalker 172 77</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> C-3PO 167 75</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> R2-D2 96 32</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Darth Vader 202 136</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Leia Organa 150 49</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Owen Lars 178 120</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Beru Whitesun Lars 165 75</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> R5-D4 97 32</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Biggs Darklighter 183 84</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Obi-Wan Kenobi 182 77</span></span>
<span><span class="co">## <span style="color: #949494;"># i 77 more rows</span></span></span></code></pre>
<p>Fine-grained control over the Parquet reader is possible with the
<code>props</code> argument. See
<code><a href="../reference/ParquetArrowReaderProperties.html">help("ParquetArrowReaderProperties", package = "arrow")</a></code> for
details.</p>
<p>R object attributes are preserved when writing data to Parquet or
Arrow/Feather files and when reading those files back into R. This
enables round-trip writing and reading of <code>sf::sf</code> objects, R
data frames with <code>haven::labelled</code> columns, and data
frames with other custom attributes. To learn more about how metadata are
handled in arrow, see the <a href="./metadata.html">metadata
article</a>.</p>
</div>
<div class="section level2">
<h2 id="arrowfeather-format">Arrow/Feather format<a class="anchor" aria-label="anchor" href="#arrowfeather-format"></a>
</h2>
<p>The Arrow file format was developed to provide binary columnar
serialization for data frames, to make reading and writing data frames
efficient, and to make sharing data across data analysis languages easy.
This file format is sometimes referred to as Feather because it is an
outgrowth of the original <a href="https://github.com/wesm/feather" class="external-link">Feather</a> project that has now
been moved into the Arrow project itself. You can find the detailed
specification of version 2 of the Arrow format – officially referred to
as <a href="https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format" class="external-link">the
Arrow IPC file format</a> – on the Arrow specification page.</p>
<p>The <code><a href="../reference/write_feather.html">write_feather()</a></code> function writes version 2
Arrow/Feather files by default, and supports multiple kinds of file
compression. Basic use is shown below:</p>
<div class="sourceCode" id="cb9"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">file_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span>
<span><span class="fu"><a href="../reference/write_feather.html">write_feather</a></span><span class="op">(</span><span class="va">starwars</span>, <span class="va">file_path</span><span class="op">)</span></span></code></pre></div>
<p>The <code><a href="../reference/read_feather.html">read_feather()</a></code> function provides a familiar
interface for reading Feather files:</p>
<div class="sourceCode" id="cb10"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="../reference/read_feather.html">read_feather</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 87 x 14</span></span></span>
<span><span class="co">## name height mass hair_color skin_color eye_color birth_year sex gender</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Luke Sk~ 172 77 blond fair blue 19 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> C-3PO 167 75 <span style="color: #BB0000;">NA</span> gold yellow 112 none mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> R2-D2 96 32 <span style="color: #BB0000;">NA</span> white, bl~ red 33 none mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Darth V~ 202 136 none white yellow 41.9 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Leia Or~ 150 49 brown light brown 19 fema~ femin~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Owen La~ 178 120 brown, gr~ light blue 52 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Beru Wh~ 165 75 brown light blue 47 fema~ femin~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> R5-D4 97 32 <span style="color: #BB0000;">NA</span> white, red red <span style="color: #BB0000;">NA</span> none mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Biggs D~ 183 84 black light brown 24 male mascu~</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Obi-Wan~ 182 77 auburn, w~ fair blue-gray 57 male mascu~</span></span>
<span><span class="co">## <span style="color: #949494;"># i 77 more rows</span></span></span>
<span><span class="co">## <span style="color: #949494;"># i 5 more variables: homeworld &lt;chr&gt;, species &lt;chr&gt;, films &lt;list&lt;character&gt;&gt;,</span></span></span>
<span><span class="co">## <span style="color: #949494;"># vehicles &lt;list&lt;character&gt;&gt;, starships &lt;list&lt;character&gt;&gt;</span></span></span></code></pre>
<p>Like the Parquet reader, this reader supports reading only a subset
of columns, and can produce Arrow Table output:</p>
<div class="sourceCode" id="cb12"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="../reference/read_feather.html">read_feather</a></span><span class="op">(</span></span>
<span> file <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> col_select <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"name"</span>, <span class="st">"height"</span>, <span class="st">"mass"</span><span class="op">)</span>,</span>
<span> as_data_frame <span class="op">=</span> <span class="cn">FALSE</span></span>
<span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## Table</span></span>
<span><span class="co">## 87 rows x 3 columns</span></span>
<span><span class="co">## $name &lt;string&gt;</span></span>
<span><span class="co">## $height &lt;int32&gt;</span></span>
<span><span class="co">## $mass &lt;double&gt;</span></span></code></pre>
</div>
<div class="section level2">
<h2 id="csv-format">CSV format<a class="anchor" aria-label="anchor" href="#csv-format"></a>
</h2>
<p>The read/write capabilities of the arrow package also include support
for CSV and other text-delimited files. The
<code><a href="../reference/read_delim_arrow.html">read_csv_arrow()</a></code>, <code><a href="../reference/read_delim_arrow.html">read_tsv_arrow()</a></code>, and
<code><a href="../reference/read_delim_arrow.html">read_delim_arrow()</a></code> functions all use the Arrow C++ CSV
reader to read data files, where the Arrow C++ options have been mapped
to arguments in a way that mirrors the conventions used in
<code>readr::read_delim()</code>, with a <code>col_select</code>
argument inspired by <code>vroom::vroom()</code>.</p>
<p>A simple example of writing and reading a CSV file with arrow is
shown below:</p>
<div class="sourceCode" id="cb14"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">file_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span>
<span><span class="fu"><a href="../reference/write_csv_arrow.html">write_csv_arrow</a></span><span class="op">(</span><span class="va">mtcars</span>, <span class="va">file_path</span><span class="op">)</span></span>
<span><span class="fu"><a href="../reference/read_delim_arrow.html">read_csv_arrow</a></span><span class="op">(</span><span class="va">file_path</span>, col_select <span class="op">=</span> <span class="fu"><a href="https://tidyselect.r-lib.org/reference/starts_with.html" class="external-link">starts_with</a></span><span class="op">(</span><span class="st">"d"</span><span class="op">)</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 32 x 2</span></span></span>
<span><span class="co">## disp drat</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> 160 3.9 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> 160 3.9 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> 108 3.85</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> 258 3.08</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> 360 3.15</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> 225 2.76</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> 360 3.21</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> 147. 3.69</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> 141. 3.92</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> 168. 3.92</span></span>
<span><span class="co">## <span style="color: #949494;"># i 22 more rows</span></span></span></code></pre>
<p>In addition to the options provided by the readr-style arguments
(<code>delim</code>, <code>quote</code>, <code>escape_double</code>,
<code>escape_backslash</code>, etc.), you can use the <code>schema</code>
argument to specify column types: see <code><a href="../reference/schema.html">schema()</a></code> help for
details. There is also the option of using <code>parse_options</code>,
<code>convert_options</code>, and <code>read_options</code> to exercise
fine-grained control over the Arrow CSV reader: see
<code><a href="../reference/CsvReadOptions.html">help("CsvReadOptions", package = "arrow")</a></code> for details.</p>
</div>
<div class="section level2">
<h2 id="json-format">JSON format<a class="anchor" aria-label="anchor" href="#json-format"></a>
</h2>
<p>The arrow package supports reading (but not writing) of tabular data
from line-delimited JSON, using the <code><a href="../reference/read_json_arrow.html">read_json_arrow()</a></code>
function. A minimal example is shown below:</p>
<div class="sourceCode" id="cb16"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">file_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/writeLines.html" class="external-link">writeLines</a></span><span class="op">(</span><span class="st">'</span></span>
<span><span class="st"> { "hello": 3.5, "world": false, "yo": "thing" }</span></span>
<span><span class="st"> { "hello": 3.25, "world": null }</span></span>
<span><span class="st"> { "hello": 0.0, "world": true, "yo": null }</span></span>
<span><span class="st"> '</span>, <span class="va">file_path</span>, useBytes <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span></span>
<span><span class="fu"><a href="../reference/read_json_arrow.html">read_json_arrow</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 3 x 3</span></span></span>
<span><span class="co">## hello world yo </span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;lgl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;">1</span> 3.5 FALSE thing</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">2</span> 3.25 <span style="color: #BB0000;">NA</span> <span style="color: #BB0000;">NA</span> </span></span>
<span><span class="co">## <span style="color: #BCBCBC;">3</span> 0 TRUE <span style="color: #BB0000;">NA</span></span></span></code></pre>
</div>
<div class="section level2">
<h2 id="further-reading">Further reading<a class="anchor" aria-label="anchor" href="#further-reading"></a>
</h2>
<ul>
<li>To learn more about cloud storage, see the <a href="./fs.html">cloud
storage article</a>.</li>
<li>To learn more about multi-file datasets, see the <a href="./dataset.html">datasets article</a>.</li>
<li>The Apache Arrow R cookbook has chapters on <a href="https://arrow.apache.org/cookbook/r/reading-and-writing-data---single-files.html" class="external-link">reading
and writing single files</a> into memory and working with <a href="https://arrow.apache.org/cookbook/r/reading-and-writing-data---multiple-files.html" class="external-link">multi-file
datasets</a> stored on-disk.</li>
</ul>
</div>
</main><aside class="col-md-3"><nav id="toc" aria-label="Table of contents"><h2>On this page</h2>
</nav></aside>
</div>
<footer><div class="pkgdown-footer-left">
<p><a href="https://arrow.apache.org/docs/r/versions.html">Older versions of these docs</a></p>
</div>
<div class="pkgdown-footer-right">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.1.3.</p>
</div>
</footer>
</div>
</body>
</html>