<!-- blob: 1d7075d50e7a51486aec6ffeb73d8fc7a48e985f [file] [log] [blame] -->
<!-- Generated by pkgdown: do not edit by hand -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Write a dataset — write_dataset • Arrow R Package</title>
<!-- jquery -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script>
<!-- Bootstrap -->
<link href="https://cdnjs.cloudflare.com/ajax/libs/bootswatch/3.4.0/cosmo/bootstrap.min.css" rel="stylesheet" crossorigin="anonymous" />
<script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script>
<!-- bootstrap-toc -->
<link rel="stylesheet" href="../bootstrap-toc.css">
<script src="../bootstrap-toc.js"></script>
<!-- Font Awesome icons -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous" />
<!-- clipboard.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script>
<!-- headroom.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script>
<!-- pkgdown -->
<link href="../pkgdown.css" rel="stylesheet">
<script src="../pkgdown.js"></script>
<script src="../extra.js"></script>
<meta property="og:title" content="Write a dataset — write_dataset" />
<meta property="og:description" content="This function allows you to write a dataset. By writing to more efficient
binary storage formats, and by specifying relevant partitioning, you can
make it much faster to read and query." />
<!-- mathjax -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script>
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
<!-- Matomo -->
<script>
// Shared Matomo command queue; an already-existing window._paq is reused.
var _paq = window._paq = window._paq || [];
(function () {
  var endpoint = "https://analytics.apache.org/";
  // Commands are queued in exactly this order; tracker methods such as
  // "setCustomDimension" would have to be placed before "trackPageView".
  var commands = [
    ["setDoNotTrack", true],
    ["disableCookies"],
    ["trackPageView"],
    ["enableLinkTracking"],
    ["setTrackerUrl", endpoint + "matomo.php"],
    ["setSiteId", "20"]
  ];
  for (var i = 0; i < commands.length; i += 1) {
    _paq.push(commands[i]);
  }
  // Insert the asynchronous matomo.js loader ahead of the first <script> on the page.
  var loader = document.createElement("script");
  var firstScript = document.getElementsByTagName("script")[0];
  loader.async = true;
  loader.src = endpoint + "matomo.js";
  firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-spy="scroll" data-target="#toc">
<div class="container template-reference-topic">
<header>
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">Arrow R Package</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Released version">4.0.1</span>
</span>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li>
<!-- Glyph-only link to the Apache Arrow site; aria-label gives it a readable accessible name. -->
<a href="https://arrow.apache.org/" aria-label="Apache Arrow project home">❯❯❯</a>
</li>
<li>
<a href="../articles/arrow.html">Get started</a>
</li>
<li>
<a href="../reference/index.html">Reference</a>
</li>
<li class="dropdown">
<!-- Dropdown toggle for the Articles menu; aria-haspopup signals that it opens a popup menu. -->
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
Articles
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu">
<li>
<a href="../articles/install.html">Installing the Arrow Package on Linux</a>
</li>
<li>
<a href="../articles/dataset.html">Working with Arrow Datasets and dplyr</a>
</li>
<li>
<a href="../articles/fs.html">Working with Cloud Storage (S3)</a>
</li>
<li>
<a href="../articles/python.html">Apache Arrow in Python and R with reticulate</a>
</li>
<li>
<a href="../articles/flight.html">Connecting to Flight RPC Servers</a>
</li>
<li>
<a href="../articles/developing.html">Arrow R Developer Guide</a>
</li>
</ul>
</li>
<li>
<a href="../news/index.html">Changelog</a>
</li>
<li class="dropdown">
<!-- Dropdown toggle for the Project docs menu; aria-haspopup signals that it opens a popup menu. -->
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
Project docs
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu">
<li>
<a href="https://arrow.apache.org/docs/format/README.html">Specification</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/c_glib">C GLib</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/cpp">C++</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/java">Java</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/js">JavaScript</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/python">Python</a>
</li>
<li>
<a href="../index.html">R</a>
</li>
</ul>
</li>
</ul>
<ul class="nav navbar-nav navbar-right">
</ul>
</div><!--/.nav-collapse -->
</div><!--/.container -->
</div><!--/.navbar -->
</header>
<div class="row">
<div class="col-md-9 contents">
<div class="page-header">
<h1>Write a dataset</h1>
<small class="dont-index">Source: <a href='https://github.com/apache/arrow/blob/master/r/R/dataset-write.R'><code>R/dataset-write.R</code></a></small>
<div class="hidden name"><code>write_dataset.Rd</code></div>
</div>
<div class="ref-description">
<p>This function allows you to write a dataset. By writing to more efficient
binary storage formats, and by specifying relevant partitioning, you can
make it much faster to read and query.</p>
</div>
<pre class="usage"><span class='fu'>write_dataset</span><span class='op'>(</span>
<span class='va'>dataset</span>,
<span class='va'>path</span>,
format <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/base/c.html'>c</a></span><span class='op'>(</span><span class='st'>"parquet"</span>, <span class='st'>"feather"</span>, <span class='st'>"arrow"</span>, <span class='st'>"ipc"</span><span class='op'>)</span>,
partitioning <span class='op'>=</span> <span class='fu'>dplyr</span><span class='fu'>::</span><span class='fu'><a href='https://dplyr.tidyverse.org/reference/group_data.html'>group_vars</a></span><span class='op'>(</span><span class='va'>dataset</span><span class='op'>)</span>,
basename_template <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/base/paste.html'>paste0</a></span><span class='op'>(</span><span class='st'>"part-{i}."</span>, <span class='fu'><a href='https://rdrr.io/r/base/character.html'>as.character</a></span><span class='op'>(</span><span class='va'>format</span><span class='op'>)</span><span class='op'>)</span>,
hive_style <span class='op'>=</span> <span class='cn'>TRUE</span>,
<span class='va'>...</span>
<span class='op'>)</span></pre>
<h2 class="hasAnchor" id="arguments"><a class="anchor" href="#arguments"></a>Arguments</h2>
<!-- Argument reference for write_dataset(): each row pairs an argument name (row header, scope="row") with its description. -->
<table class="ref-arguments">
<colgroup><col class="name" /><col class="desc" /></colgroup>
<tr>
<th scope="row">dataset</th>
<td><p><a href='Dataset.html'>Dataset</a>, <a href='RecordBatch.html'>RecordBatch</a>, <a href='Table.html'>Table</a>, <code>arrow_dplyr_query</code>, or
<code>data.frame</code>. If an <code>arrow_dplyr_query</code> or <code>grouped_df</code>,
<code>schema</code> and <code>partitioning</code> will be taken from the result of any <code><a href='https://dplyr.tidyverse.org/reference/select.html'>select()</a></code>
and <code><a href='https://dplyr.tidyverse.org/reference/group_by.html'>group_by()</a></code> operations done on the dataset. <code><a href='https://dplyr.tidyverse.org/reference/filter.html'>filter()</a></code> queries will be
applied to restrict written rows.
Note that <code><a href='https://dplyr.tidyverse.org/reference/select.html'>select()</a></code>-ed columns may not be renamed.</p></td>
</tr>
<tr>
<th scope="row">path</th>
<td><p>string path, URI, or <code>SubTreeFileSystem</code> referencing a directory
to write to (directory will be created if it does not exist)</p></td>
</tr>
<tr>
<th scope="row">format</th>
<td><p>a string identifier of the file format. Default is to use
"parquet" (see <a href='FileFormat.html'>FileFormat</a>)</p></td>
</tr>
<tr>
<th scope="row">partitioning</th>
<td><p><code>Partitioning</code> or a character vector of columns to
use as partition keys (to be written as path segments). Default is to
use the current <code><a href='https://dplyr.tidyverse.org/reference/group_by.html'>group_by()</a></code> columns.</p></td>
</tr>
<tr>
<th scope="row">basename_template</th>
<td><p>string template for the names of files to be written.
Must contain <code>"{i}"</code>, which will be replaced with an autoincremented
integer to generate basenames of datafiles. For example, <code>"part-{i}.feather"</code>
will yield <code>"part-0.feather", ...</code>.</p></td>
</tr>
<tr>
<th scope="row">hive_style</th>
<td><p>logical: write partition segments as Hive-style
(<code>key1=value1/key2=value2/file.ext</code>) or as just bare values. Default is <code>TRUE</code>.</p></td>
</tr>
<tr>
<th scope="row">...</th>
<td><p>additional format-specific arguments. For available Parquet
options, see <code><a href='write_parquet.html'>write_parquet()</a></code>. The available Feather options are</p><ul>
<li><p><code>use_legacy_format</code>: logical; write data formatted so that Arrow library
versions 0.14 and lower can read it. Default is <code>FALSE</code>. You can also
enable this by setting the environment variable <code>ARROW_PRE_0_15_IPC_FORMAT=1</code>.</p></li>
<li><p><code>metadata_version</code>: A string like "V5" or the equivalent integer indicating
the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
unless the environment variable <code>ARROW_PRE_1_0_METADATA_VERSION=1</code> is set, in
which case it will be V4.</p></li>
<li><p><code>codec</code>: A <a href='Codec.html'>Codec</a> which will be used to compress body buffers of written
files. Default (NULL) will not compress body buffers.</p></li>
<li><p><code>null_fallback</code>: character to be used in place of missing values (<code>NA</code> or
<code>NULL</code>) when using Hive-style partitioning. See <code><a href='hive_partition.html'>hive_partition()</a></code>.</p></li>
</ul></td>
</tr>
</table>
<h2 class="hasAnchor" id="value"><a class="anchor" href="#value"></a>Value</h2>
<p>The input <code>dataset</code>, invisibly</p>
</div>
<div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
<nav id="toc" data-toggle="toc" class="sticky-top">
<h2 data-toc-skip>Contents</h2>
</nav>
</div>
</div>
<footer>
<div class="copyright">
<p>Developed by Neal Richardson, Ian Cook, Jonathan Keane, Romain François, Jeroen Ooms, Apache Arrow.</p>
</div>
<div class="pkgdown">
<p>Site built with <a href="https://pkgdown.r-lib.org/">pkgdown</a> 1.6.1.</p>
</div>
</footer>
</div>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script> </body>
</html>