blob: 16a4de76ff5c122fb15c18961b2bb64468998257 [file] [log] [blame]
<!-- Generated by pkgdown: do not edit by hand -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Read a CSV or other delimited file with Arrow — read_delim_arrow • Arrow R Package</title>
<!-- jquery -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script>
<!-- Bootstrap -->
<link href="https://cdnjs.cloudflare.com/ajax/libs/bootswatch/3.4.0/cosmo/bootstrap.min.css" rel="stylesheet" crossorigin="anonymous" />
<script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script>
<!-- bootstrap-toc -->
<link rel="stylesheet" href="../bootstrap-toc.css">
<script src="../bootstrap-toc.js"></script>
<!-- Font Awesome icons -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous" />
<!-- clipboard.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script>
<!-- headroom.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script>
<!-- pkgdown -->
<link href="../pkgdown.css" rel="stylesheet">
<script src="../pkgdown.js"></script>
<script src="../extra.js"></script>
<meta property="og:title" content="Read a CSV or other delimited file with Arrow — read_delim_arrow" />
<meta property="og:description" content="These functions use the Arrow C++ CSV reader to read into a data.frame.
Arrow C++ options have been mapped to argument names that follow those of
readr::read_delim(), and col_select was inspired by vroom::vroom()." />
<!-- mathjax -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script>
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
<!-- Matomo -->
<script>
// Matomo command queue: reuse window._paq when it already exists, otherwise start with an empty array.
var _paq = window._paq = window._paq || [];
// Configuration commands (e.g. "setCustomDimension") must be queued ahead of "trackPageView".
_paq.push(["setDoNotTrack", true]);
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  var trackerBase = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', trackerBase + 'matomo.php']);
  _paq.push(['setSiteId', '20']);
  // Build an async loader element for matomo.js and insert it ahead of the
  // first script element currently in the document.
  var doc = document;
  var loader = doc.createElement('script');
  var firstScript = doc.getElementsByTagName('script')[0];
  loader.async = true;
  loader.src = trackerBase + 'matomo.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-spy="scroll" data-target="#toc">
<div class="container template-reference-topic">
<header>
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">Arrow R Package</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Released version">4.0.1</span>
</span>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li>
<a href="https://arrow.apache.org/">❯❯❯</a>
</li>
<li>
<a href="../articles/arrow.html">Get started</a>
</li>
<li>
<a href="../reference/index.html">Reference</a>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
Articles
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu">
<li>
<a href="../articles/install.html">Installing the Arrow Package on Linux</a>
</li>
<li>
<a href="../articles/dataset.html">Working with Arrow Datasets and dplyr</a>
</li>
<li>
<a href="../articles/fs.html">Working with Cloud Storage (S3)</a>
</li>
<li>
<a href="../articles/python.html">Apache Arrow in Python and R with reticulate</a>
</li>
<li>
<a href="../articles/flight.html">Connecting to Flight RPC Servers</a>
</li>
<li>
<a href="../articles/developing.html">Arrow R Developer Guide</a>
</li>
</ul>
</li>
<li>
<a href="../news/index.html">Changelog</a>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
Project docs
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu">
<li>
<a href="https://arrow.apache.org/docs/format/README.html">Specification</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/c_glib">C GLib</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/cpp">C++</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/java">Java</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/js">JavaScript</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/python">Python</a>
</li>
<li>
<a href="../index.html">R</a>
</li>
</ul>
</li>
</ul>
<ul class="nav navbar-nav navbar-right">
</ul>
</div><!--/.nav-collapse -->
</div><!--/.container -->
</div><!--/.navbar -->
</header>
<div class="row">
<div class="col-md-9 contents">
<div class="page-header">
<h1>Read a CSV or other delimited file with Arrow</h1>
<small class="dont-index">Source: <a href='https://github.com/apache/arrow/blob/master/r/R/csv.R'><code>R/csv.R</code></a></small>
<div class="hidden name"><code>read_delim_arrow.Rd</code></div>
</div>
<div class="ref-description">
<p>These functions use the Arrow C++ CSV reader to read into a <code>data.frame</code>.
Arrow C++ options have been mapped to argument names that follow those of
<code>readr::read_delim()</code>, and <code>col_select</code> was inspired by <code>vroom::vroom()</code>.</p>
</div>
<pre class="usage"><span class='fu'>read_delim_arrow</span><span class='op'>(</span>
<span class='va'>file</span>,
delim <span class='op'>=</span> <span class='st'>","</span>,
quote <span class='op'>=</span> <span class='st'>"\""</span>,
escape_double <span class='op'>=</span> <span class='cn'>TRUE</span>,
escape_backslash <span class='op'>=</span> <span class='cn'>FALSE</span>,
schema <span class='op'>=</span> <span class='cn'>NULL</span>,
col_names <span class='op'>=</span> <span class='cn'>TRUE</span>,
col_types <span class='op'>=</span> <span class='cn'>NULL</span>,
col_select <span class='op'>=</span> <span class='cn'>NULL</span>,
na <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/base/c.html'>c</a></span><span class='op'>(</span><span class='st'>""</span>, <span class='st'>"NA"</span><span class='op'>)</span>,
quoted_na <span class='op'>=</span> <span class='cn'>TRUE</span>,
skip_empty_rows <span class='op'>=</span> <span class='cn'>TRUE</span>,
skip <span class='op'>=</span> <span class='fl'>0L</span>,
parse_options <span class='op'>=</span> <span class='cn'>NULL</span>,
convert_options <span class='op'>=</span> <span class='cn'>NULL</span>,
read_options <span class='op'>=</span> <span class='cn'>NULL</span>,
as_data_frame <span class='op'>=</span> <span class='cn'>TRUE</span>,
timestamp_parsers <span class='op'>=</span> <span class='cn'>NULL</span>
<span class='op'>)</span>
<span class='fu'>read_csv_arrow</span><span class='op'>(</span>
<span class='va'>file</span>,
quote <span class='op'>=</span> <span class='st'>"\""</span>,
escape_double <span class='op'>=</span> <span class='cn'>TRUE</span>,
escape_backslash <span class='op'>=</span> <span class='cn'>FALSE</span>,
schema <span class='op'>=</span> <span class='cn'>NULL</span>,
col_names <span class='op'>=</span> <span class='cn'>TRUE</span>,
col_types <span class='op'>=</span> <span class='cn'>NULL</span>,
col_select <span class='op'>=</span> <span class='cn'>NULL</span>,
na <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/base/c.html'>c</a></span><span class='op'>(</span><span class='st'>""</span>, <span class='st'>"NA"</span><span class='op'>)</span>,
quoted_na <span class='op'>=</span> <span class='cn'>TRUE</span>,
skip_empty_rows <span class='op'>=</span> <span class='cn'>TRUE</span>,
skip <span class='op'>=</span> <span class='fl'>0L</span>,
parse_options <span class='op'>=</span> <span class='cn'>NULL</span>,
convert_options <span class='op'>=</span> <span class='cn'>NULL</span>,
read_options <span class='op'>=</span> <span class='cn'>NULL</span>,
as_data_frame <span class='op'>=</span> <span class='cn'>TRUE</span>,
timestamp_parsers <span class='op'>=</span> <span class='cn'>NULL</span>
<span class='op'>)</span>
<span class='fu'>read_tsv_arrow</span><span class='op'>(</span>
<span class='va'>file</span>,
quote <span class='op'>=</span> <span class='st'>"\""</span>,
escape_double <span class='op'>=</span> <span class='cn'>TRUE</span>,
escape_backslash <span class='op'>=</span> <span class='cn'>FALSE</span>,
schema <span class='op'>=</span> <span class='cn'>NULL</span>,
col_names <span class='op'>=</span> <span class='cn'>TRUE</span>,
col_types <span class='op'>=</span> <span class='cn'>NULL</span>,
col_select <span class='op'>=</span> <span class='cn'>NULL</span>,
na <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/base/c.html'>c</a></span><span class='op'>(</span><span class='st'>""</span>, <span class='st'>"NA"</span><span class='op'>)</span>,
quoted_na <span class='op'>=</span> <span class='cn'>TRUE</span>,
skip_empty_rows <span class='op'>=</span> <span class='cn'>TRUE</span>,
skip <span class='op'>=</span> <span class='fl'>0L</span>,
parse_options <span class='op'>=</span> <span class='cn'>NULL</span>,
convert_options <span class='op'>=</span> <span class='cn'>NULL</span>,
read_options <span class='op'>=</span> <span class='cn'>NULL</span>,
as_data_frame <span class='op'>=</span> <span class='cn'>TRUE</span>,
timestamp_parsers <span class='op'>=</span> <span class='cn'>NULL</span>
<span class='op'>)</span></pre>
<h2 class="hasAnchor" id="arguments"><a class="anchor" href="#arguments"></a>Arguments</h2>
<table class="ref-arguments">
<colgroup><col class="name" /><col class="desc" /></colgroup>
<tr>
<th>file</th>
<td><p>A character file name or URI, <code>raw</code> vector, an Arrow input stream,
or a <code>FileSystem</code> with path (<code>SubTreeFileSystem</code>).
If a file name, a memory-mapped Arrow <a href='InputStream.html'>InputStream</a> will be opened and
closed when finished; compression will be detected from the file extension
and handled automatically. If an input stream is provided, it will be left
open.</p></td>
</tr>
<tr>
<th>delim</th>
<td><p>Single character used to separate fields within a record.</p></td>
</tr>
<tr>
<th>quote</th>
<td><p>Single character used to quote strings.</p></td>
</tr>
<tr>
<th>escape_double</th>
<td><p>Does the file escape quotes by doubling them?
i.e. If this option is <code>TRUE</code>, the value <code>""""</code> represents
a single quote, <code>\"</code>.</p></td>
</tr>
<tr>
<th>escape_backslash</th>
<td><p>Does the file use backslashes to escape special
characters? This is more general than <code>escape_double</code> as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like <code>\\n</code>.</p></td>
</tr>
<tr>
<th>schema</th>
<td><p><a href='Schema.html'>Schema</a> that describes the table. If provided, it will be
used to satisfy both <code>col_names</code> and <code>col_types</code>.</p></td>
</tr>
<tr>
<th>col_names</th>
<td><p>If <code>TRUE</code>, the first row of the input will be used as the
column names and will not be included in the data frame. If <code>FALSE</code>, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.</p></td>
</tr>
<tr>
<th>col_types</th>
<td><p>A compact string representation of the column types, or
<code>NULL</code> (the default) to infer types from the data.</p></td>
</tr>
<tr>
<th>col_select</th>
<td><p>A character vector of column names to keep, as in the
"select" argument to <code>data.table::fread()</code>, or a
<a href='https://tidyselect.r-lib.org/reference/vars_select.html'>tidy selection specification</a>
of columns, as used in <code><a href='https://dplyr.tidyverse.org/reference/select.html'>dplyr::select()</a></code>.</p></td>
</tr>
<tr>
<th>na</th>
<td><p>A character vector of strings to interpret as missing values.</p></td>
</tr>
<tr>
<th>quoted_na</th>
<td><p>Should missing values inside quotes be treated as missing
values (the default) or strings? (Note that this is different from
the Arrow C++ default for the corresponding convert option,
<code>strings_can_be_null</code>.)</p></td>
</tr>
<tr>
<th>skip_empty_rows</th>
<td><p>Should blank rows be ignored altogether? If
<code>TRUE</code>, blank rows will not be represented at all. If <code>FALSE</code>, they will be
filled with missings.</p></td>
</tr>
<tr>
<th>skip</th>
<td><p>Number of lines to skip before reading data.</p></td>
</tr>
<tr>
<th>parse_options</th>
<td><p>see <a href='CsvReadOptions.html'>file reader options</a>.
If given, this overrides any
parsing options provided in other arguments (e.g. <code>delim</code>, <code>quote</code>, etc.).</p></td>
</tr>
<tr>
<th>convert_options</th>
<td><p>see <a href='CsvReadOptions.html'>file reader options</a></p></td>
</tr>
<tr>
<th>read_options</th>
<td><p>see <a href='CsvReadOptions.html'>file reader options</a></p></td>
</tr>
<tr>
<th>as_data_frame</th>
<td><p>Should the function return a <code>data.frame</code> (default) or
an Arrow <a href='Table.html'>Table</a>?</p></td>
</tr>
<tr>
<th>timestamp_parsers</th>
<td><p>User-defined timestamp parsers. If more than one
parser is specified, the CSV conversion logic will try parsing values
starting from the beginning of this vector. Possible values are:</p><ul>
<li><p><code>NULL</code>: the default, which uses the ISO-8601 parser</p></li>
<li><p>a character vector of <a href='https://rdrr.io/r/base/strptime.html'>strptime</a> parse strings</p></li>
<li><p>a list of <a href='CsvReadOptions.html'>TimestampParser</a> objects</p></li>
</ul></td>
</tr>
</table>
<h2 class="hasAnchor" id="value"><a class="anchor" href="#value"></a>Value</h2>
<p>A <code>data.frame</code>, or a Table if <code>as_data_frame = FALSE</code>.</p>
<h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2>
<p><code>read_csv_arrow()</code> and <code>read_tsv_arrow()</code> are wrappers around
<code>read_delim_arrow()</code> that specify a delimiter.</p>
<p>Note that not all <code>readr</code> options are currently implemented here. Please file
an issue if you encounter one that <code>arrow</code> should support.</p>
<p>If you need to control Arrow-specific reader parameters that don't have an
equivalent in <code>readr::read_csv()</code>, you can either provide them in the
<code>parse_options</code>, <code>convert_options</code>, or <code>read_options</code> arguments, or you can
use <a href='CsvTableReader.html'>CsvTableReader</a> directly for lower-level access.</p>
<h2 class="hasAnchor" id="specifying-column-types-and-names"><a class="anchor" href="#specifying-column-types-and-names"></a>Specifying column types and names</h2>
<p>By default, the CSV reader will infer the column names and data types from the file, but there
are a few ways you can specify them directly.</p>
<p>One way is to provide an Arrow <a href='Schema.html'>Schema</a> in the <code>schema</code> argument,
which is an ordered map of column name to type.
When provided, it satisfies both the <code>col_names</code> and <code>col_types</code> arguments.
This is good if you know all of this information up front.</p>
<p>You can also pass a <code>Schema</code> to the <code>col_types</code> argument. If you do this,
column names will still be inferred from the file unless you also specify
<code>col_names</code>. In either case, the column names in the <code>Schema</code> must match the
data's column names, whether they are explicitly provided or inferred. That
said, this <code>Schema</code> does not have to reference all columns: those omitted
will have their types inferred.</p>
<p>Alternatively, you can declare column types by providing the compact string representation
that <code>readr</code> uses to the <code>col_types</code> argument. This means you provide a
single string, one character per column, where the characters map to Arrow
types analogously to the <code>readr</code> type mapping:</p><ul>
<li><p>"c": <code><a href='data-type.html'>utf8()</a></code></p></li>
<li><p>"i": <code><a href='data-type.html'>int32()</a></code></p></li>
<li><p>"n": <code><a href='data-type.html'>float64()</a></code></p></li>
<li><p>"d": <code><a href='data-type.html'>float64()</a></code></p></li>
<li><p>"l": <code><a href='data-type.html'>bool()</a></code></p></li>
<li><p>"f": <code><a href='dictionary.html'>dictionary()</a></code></p></li>
<li><p>"D": <code><a href='data-type.html'>date32()</a></code></p></li>
<li><p>"T": <code><a href='data-type.html'>time32()</a></code></p></li>
<li><p>"t": <code><a href='data-type.html'>timestamp()</a></code></p></li>
<li><p>"_": <code><a href='data-type.html'>null()</a></code></p></li>
<li><p>"-": <code><a href='data-type.html'>null()</a></code></p></li>
<li><p>"?": infer the type from the data</p></li>
</ul>
<p>If you use the compact string representation for <code>col_types</code>, you must also
specify <code>col_names</code>.</p>
<p>Regardless of how types are specified, all columns with a <code><a href='data-type.html'>null()</a></code> type will
be dropped.</p>
<p>Note that if you are specifying column names, whether by <code>schema</code> or
<code>col_names</code>, and the CSV file has a header row that would otherwise be used
to identify column names, you'll need to add <code>skip = 1</code> to skip that row.</p>
<h2 class="hasAnchor" id="examples"><a class="anchor" href="#examples"></a>Examples</h2>
<pre class="examples"><div class='input'><span class='co'># \donttest{</span>
<span class='va'>tf</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/r/base/tempfile.html'>tempfile</a></span><span class='op'>(</span><span class='op'>)</span>
<span class='fu'><a href='https://rdrr.io/r/base/on.exit.html'>on.exit</a></span><span class='op'>(</span><span class='fu'><a href='https://rdrr.io/r/base/unlink.html'>unlink</a></span><span class='op'>(</span><span class='va'>tf</span><span class='op'>)</span><span class='op'>)</span>
<span class='fu'><a href='https://rdrr.io/r/utils/write.table.html'>write.csv</a></span><span class='op'>(</span><span class='va'>mtcars</span>, file <span class='op'>=</span> <span class='va'>tf</span><span class='op'>)</span>
<span class='va'>df</span> <span class='op'>&lt;-</span> <span class='fu'>read_csv_arrow</span><span class='op'>(</span><span class='va'>tf</span><span class='op'>)</span>
<span class='fu'><a href='https://rdrr.io/r/base/dim.html'>dim</a></span><span class='op'>(</span><span class='va'>df</span><span class='op'>)</span>
</div><div class='output co'>#&gt; [1] 32 12</div><div class='input'> <span class='co'># Can select columns</span>
<span class='va'>df</span> <span class='op'>&lt;-</span> <span class='fu'>read_csv_arrow</span><span class='op'>(</span><span class='va'>tf</span>, col_select <span class='op'>=</span> <span class='fu'><a href='https://tidyselect.r-lib.org/reference/starts_with.html'>starts_with</a></span><span class='op'>(</span><span class='st'>"d"</span><span class='op'>)</span><span class='op'>)</span>
<span class='co'># }</span>
</div></pre>
</div>
<div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
<nav id="toc" data-toggle="toc" class="sticky-top">
<h2 data-toc-skip>Contents</h2>
</nav>
</div>
</div>
<footer>
<div class="copyright">
<p>Developed by Neal Richardson, Ian Cook, Jonathan Keane, Romain François, Jeroen Ooms, Apache Arrow.</p>
</div>
<div class="pkgdown">
<p>Site built with <a href="https://pkgdown.r-lib.org/">pkgdown</a> 1.6.1.</p>
</div>
</footer>
</div>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script> </body>
</html>