blob: 7d7b66527e2c8fa84cd4deb374693ed69a7deb6c [file] [log] [blame]
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en-US"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>Write Parquet file to disk — write_parquet • Arrow R Package</title><!-- favicons --><link rel="icon" type="image/png" sizes="96x96" href="../favicon-96x96.png"><link rel="icon" type="image/svg+xml" href="../favicon.svg"><link rel="apple-touch-icon" sizes="180x180" href="../apple-touch-icon.png"><link rel="icon" sizes="any" href="../favicon.ico"><link rel="manifest" href="../site.webmanifest"><script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet"><script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><link href="../deps/font-awesome-6.5.2/css/all.min.css" rel="stylesheet"><link href="../deps/font-awesome-6.5.2/css/v4-shims.min.css" rel="stylesheet"><script src="../deps/headroom-0.11.0/headroom.min.js"></script><script src="../deps/headroom-0.11.0/jQuery.headroom.min.js"></script><script src="../deps/bootstrap-toc-1.0.1/bootstrap-toc.min.js"></script><script src="../deps/clipboard.js-2.0.11/clipboard.min.js"></script><script src="../deps/search-1.0.0/autocomplete.jquery.min.js"></script><script src="../deps/search-1.0.0/fuse.min.js"></script><script src="../deps/search-1.0.0/mark.min.js"></script><!-- pkgdown --><script src="../pkgdown.js"></script><link href="../extra.css" rel="stylesheet"><meta property="og:title" content="Write Parquet file to disk — write_parquet"><meta name="description" content="Parquet is a columnar storage file format.
This function enables you to write Parquet files from R."><meta property="og:description" content="Parquet is a columnar storage file format.
This function enables you to write Parquet files from R."><meta property="og:image" content="https://arrow.apache.org/img/arrow-logo_horizontal_black-txt_white-bg.png"><meta property="og:image:alt" content="Apache Arrow logo, displaying the triple chevron image adjacent to the text"><!-- Matomo --><script>
/* Matomo command queue: reuse an existing window._paq, or start an empty one. */
var _paq = window._paq = window._paq || [];
/* Tracker methods such as "setCustomDimension" must be queued ahead of "trackPageView". */
/* Cookie tracking is switched off explicitly to avoid privacy issues. */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  var matomoBase = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', matomoBase + 'matomo.php']);
  _paq.push(['setSiteId', '20']);
  /* Build the async loader tag and insert it ahead of the first script element on the page. */
  var doc = document;
  var loader = doc.createElement('script');
  var firstScript = doc.getElementsByTagName('script')[0];
  loader.async = true;
  loader.src = matomoBase + 'matomo.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script><!-- End Matomo Code --><!-- Kapa AI --><script async src="https://widget.kapa.ai/kapa-widget.bundle.js" data-website-id="9db461d5-ac77-4b3f-a5c5-75efa78339d2" data-project-name="Apache Arrow" data-project-color="#000000" data-project-logo="https://arrow.apache.org/img/arrow-logo_chevrons_white-txt_black-bg.png" data-modal-disclaimer="This is a custom LLM with access to all of [Arrow documentation](https://arrow.apache.org/docs/). If you want an R-specific answer, please mention this in your question." data-consent-required="true" data-user-analytics-cookie-enabled="false" data-consent-screen-disclaimer="By clicking &quot;I agree, let's chat&quot;, you consent to the use of the AI assistant in accordance with kapa.ai's [Privacy Policy](https://www.kapa.ai/content/privacy-policy). This service uses reCAPTCHA, which requires your consent to Google's [Privacy Policy](https://policies.google.com/privacy) and [Terms of Service](https://policies.google.com/terms). By proceeding, you explicitly agree to both kapa.ai's and Google's privacy policies."></script><!-- End Kapa AI --></head><body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-black"><div class="container">
<a class="navbar-brand me-2" href="../index.html">Arrow R Package</a>
<span class="version">
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">22.0.0</small>
</span>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto"><li class="nav-item"><a class="nav-link" href="../articles/arrow.html">Get started</a></li>
<li class="active nav-item"><a class="nav-link" href="../reference/index.html">Reference</a></li>
<li class="nav-item dropdown">
<button class="nav-link dropdown-toggle" type="button" id="dropdown-articles" data-bs-toggle="dropdown" aria-expanded="false" aria-haspopup="true">Articles</button>
<ul class="dropdown-menu" aria-labelledby="dropdown-articles"><li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Using the package</h6></li>
<li><a class="dropdown-item" href="../articles/read_write.html">Reading and writing data files</a></li>
<li><a class="dropdown-item" href="../articles/data_wrangling.html">Data analysis with dplyr syntax</a></li>
<li><a class="dropdown-item" href="../articles/dataset.html">Working with multi-file data sets</a></li>
<li><a class="dropdown-item" href="../articles/python.html">Integrating Arrow, Python, and R</a></li>
<li><a class="dropdown-item" href="../articles/fs.html">Using cloud storage (S3, GCS)</a></li>
<li><a class="dropdown-item" href="../articles/flight.html">Connecting to a Flight server</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Arrow concepts</h6></li>
<li><a class="dropdown-item" href="../articles/data_objects.html">Data objects</a></li>
<li><a class="dropdown-item" href="../articles/data_types.html">Data types</a></li>
<li><a class="dropdown-item" href="../articles/metadata.html">Metadata</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Installation</h6></li>
<li><a class="dropdown-item" href="../articles/install.html">Installing on Linux</a></li>
<li><a class="dropdown-item" href="../articles/install_nightly.html">Installing development versions</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="../articles/index.html">More articles...</a></li>
</ul></li>
<li class="nav-item"><a class="nav-link" href="../news/index.html">Changelog</a></li>
</ul><form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search" name="search-input" data-search-index="../search.json" id="search-input" placeholder="" autocomplete="off"></form>
<ul class="navbar-nav"><li class="nav-item"><a class="external-link nav-link" href="https://github.com/apache/arrow/" aria-label="GitHub"><span class="fa fab fa-github fa-lg"></span></a></li>
</ul></div>
</div>
</nav><div class="container template-reference-topic">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>Write Parquet file to disk</h1>
<small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/main/r/R/parquet.R" class="external-link"><code>R/parquet.R</code></a></small>
<div class="d-none name"><code>write_parquet.Rd</code></div>
</div>
<div class="ref-description section level2">
<p><a href="https://parquet.apache.org/" class="external-link">Parquet</a> is a columnar storage file format.
This function enables you to write Parquet files from R.</p>
</div>
<div class="section level2">
<h2 id="ref-usage">Usage<a class="anchor" aria-label="anchor" href="#ref-usage"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span><span class="fu">write_parquet</span><span class="op">(</span></span>
<span> <span class="va">x</span>,</span>
<span> <span class="va">sink</span>,</span>
<span> chunk_size <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> version <span class="op">=</span> <span class="st">"2.4"</span>,</span>
<span> compression <span class="op">=</span> <span class="fu">default_parquet_compression</span><span class="op">(</span><span class="op">)</span>,</span>
<span> compression_level <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> use_dictionary <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> write_statistics <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> data_page_size <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> use_deprecated_int96_timestamps <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> coerce_timestamps <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> allow_truncated_timestamps <span class="op">=</span> <span class="cn">FALSE</span></span>
<span><span class="op">)</span></span></code></pre></div>
</div>
<div class="section level2">
<h2 id="arguments">Arguments<a class="anchor" aria-label="anchor" href="#arguments"></a></h2>
<dl><dt id="arg-x">x<a class="anchor" aria-label="anchor" href="#arg-x"></a></dt>
<dd><p><code>data.frame</code>, <a href="RecordBatch-class.html">RecordBatch</a>, or <a href="Table-class.html">Table</a></p></dd>
<dt id="arg-sink">sink<a class="anchor" aria-label="anchor" href="#arg-sink"></a></dt>
<dd><p>A string file path, connection, URI, or <a href="OutputStream.html">OutputStream</a>, or path in a file
system (<code>SubTreeFileSystem</code>)</p></dd>
<dt id="arg-chunk-size">chunk_size<a class="anchor" aria-label="anchor" href="#arg-chunk-size"></a></dt>
<dd><p>how many rows of data to write to disk at once. This
directly corresponds to how many rows will be in each row group in
parquet. If <code>NULL</code>, a best guess will be made for optimal size (based on
the number of columns and number of rows), though if the data has fewer
than 250 million cells (rows x cols), then the total number of rows is
used.</p></dd>
<dt id="arg-version">version<a class="anchor" aria-label="anchor" href="#arg-version"></a></dt>
<dd><p>parquet version: "1.0", "2.4" (default), "2.6", or
"latest" (currently equivalent to 2.6). Numeric values are
coerced to character.</p></dd>
<dt id="arg-compression">compression<a class="anchor" aria-label="anchor" href="#arg-compression"></a></dt>
<dd><p>compression algorithm. Default "snappy". See details.</p></dd>
<dt id="arg-compression-level">compression_level<a class="anchor" aria-label="anchor" href="#arg-compression-level"></a></dt>
<dd><p>compression level. Meaning depends on compression
algorithm.</p></dd>
<dt id="arg-use-dictionary">use_dictionary<a class="anchor" aria-label="anchor" href="#arg-use-dictionary"></a></dt>
<dd><p>logical: use dictionary encoding? Default <code>TRUE</code></p></dd>
<dt id="arg-write-statistics">write_statistics<a class="anchor" aria-label="anchor" href="#arg-write-statistics"></a></dt>
<dd><p>logical: include statistics? Default <code>TRUE</code></p></dd>
<dt id="arg-data-page-size">data_page_size<a class="anchor" aria-label="anchor" href="#arg-data-page-size"></a></dt>
<dd><p>Set a target threshold for the approximate encoded
size of data pages within a column chunk (in bytes). Default 1 MiB.</p></dd>
<dt id="arg-use-deprecated-int-timestamps">use_deprecated_int96_timestamps<a class="anchor" aria-label="anchor" href="#arg-use-deprecated-int-timestamps"></a></dt>
<dd><p>logical: write timestamps to INT96
Parquet format, which has been deprecated? Default <code>FALSE</code>.</p></dd>
<dt id="arg-coerce-timestamps">coerce_timestamps<a class="anchor" aria-label="anchor" href="#arg-coerce-timestamps"></a></dt>
<dd><p>Cast timestamps to a particular resolution. Can be
<code>NULL</code>, "ms" or "us". Default <code>NULL</code> (no casting).</p></dd>
<dt id="arg-allow-truncated-timestamps">allow_truncated_timestamps<a class="anchor" aria-label="anchor" href="#arg-allow-truncated-timestamps"></a></dt>
<dd><p>logical: Allow loss of data when coercing
timestamps to a particular resolution. E.g. if microsecond or nanosecond
data is lost when coercing to "ms", do not raise an exception. Default
<code>FALSE</code>.</p></dd>
</dl></div>
<div class="section level2">
<h2 id="value">Value<a class="anchor" aria-label="anchor" href="#value"></a></h2>
<p>the input <code>x</code> invisibly.</p>
</div>
<div class="section level2">
<h2 id="details">Details<a class="anchor" aria-label="anchor" href="#details"></a></h2>
<p>Due to features of the format, Parquet files cannot be appended to.
If you want to use the Parquet format but also want the ability to extend
your dataset, you can write to additional Parquet files and then treat
the whole directory of files as a <a href="Dataset.html">Dataset</a> you can query.
See the <a href="https://arrow.apache.org/docs/r/articles/dataset.html">dataset
article</a> for examples of this.</p>
<p>The parameters <code>compression</code>, <code>compression_level</code>, <code>use_dictionary</code> and
<code>write_statistics</code> support various patterns:</p><ul><li><p>The default <code>NULL</code> leaves the parameter unspecified, and the C++ library
uses an appropriate default for each column (defaults listed above)</p></li>
<li><p>A single, unnamed, value (e.g. a single string for <code>compression</code>) applies to all columns</p></li>
<li><p>An unnamed vector, of the same size as the number of columns, to specify a
value for each column, in positional order</p></li>
<li><p>A named vector, to specify the value for the named columns; the default
value for the setting is used for any column not named</p></li>
</ul><p>The <code>compression</code> argument can be any of the following (case-insensitive):
"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
are almost always included. See <code><a href="codec_is_available.html">codec_is_available()</a></code>.
The default "snappy" is used if available, otherwise "uncompressed". To
disable compression, set <code>compression = "uncompressed"</code>.
Note that "uncompressed" columns may still have dictionary encoding.</p>
</div>
<div class="section level2">
<h2 id="see-also">See also<a class="anchor" aria-label="anchor" href="#see-also"></a></h2>
<div class="dont-index"><p><a href="ParquetFileWriter.html">ParquetFileWriter</a> for a lower-level interface to Parquet writing.</p></div>
</div>
<div class="section level2">
<h2 id="ref-examples">Examples<a class="anchor" aria-label="anchor" href="#ref-examples"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="va">tf1</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".parquet"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu">write_parquet</span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="fl">1</span><span class="op">:</span><span class="fl">5</span><span class="op">)</span>, <span class="va">tf1</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># using compression</span></span></span>
<span class="r-in"><span><span class="kw">if</span> <span class="op">(</span><span class="fu"><a href="codec_is_available.html">codec_is_available</a></span><span class="op">(</span><span class="st">"gzip"</span><span class="op">)</span><span class="op">)</span> <span class="op">{</span></span></span>
<span class="r-in"><span> <span class="va">tf2</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".gz.parquet"</span><span class="op">)</span></span></span>
<span class="r-in"><span> <span class="fu">write_parquet</span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="fl">1</span><span class="op">:</span><span class="fl">5</span><span class="op">)</span>, <span class="va">tf2</span>, compression <span class="op">=</span> <span class="st">"gzip"</span>, compression_level <span class="op">=</span> <span class="fl">5</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">}</span></span></span>
</code></pre></div>
</div>
</main><aside class="col-md-3"><nav id="toc" aria-label="Table of contents"><h2>On this page</h2>
</nav></aside></div>
<footer><div class="pkgdown-footer-left">
<p><a href="https://arrow.apache.org/docs/r/versions.html">Older versions of these docs</a></p>
</div>
<div class="pkgdown-footer-right">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.1.3.</p>
</div>
</footer></div>
</body></html>