<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="Learn how to use the dplyr backend supplied by arrow">
<title>Data analysis with dplyr syntax • Arrow R Package</title>
<!-- favicons --><link rel="icon" type="image/png" sizes="16x16" href="../favicon-16x16.png">
<link rel="icon" type="image/png" sizes="32x32" href="../favicon-32x32.png">
<link rel="apple-touch-icon" type="image/png" sizes="180x180" href="../apple-touch-icon.png">
<link rel="apple-touch-icon" type="image/png" sizes="120x120" href="../apple-touch-icon-120x120.png">
<link rel="apple-touch-icon" type="image/png" sizes="76x76" href="../apple-touch-icon-76x76.png">
<link rel="apple-touch-icon" type="image/png" sizes="60x60" href="../apple-touch-icon-60x60.png">
<script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet">
<script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous">
<!-- bootstrap-toc --><script src="https://cdn.jsdelivr.net/gh/afeld/bootstrap-toc@v1.0.1/dist/bootstrap-toc.min.js" integrity="sha256-4veVQbu7//Lk5TSmc7YV48MxtMy98e26cf5MrgZYnwo=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.11/clipboard.min.js" integrity="sha512-7O5pXpc0oCRrxk8RUfDYFgn0nO1t+jLuIOQdOMRp4APB7uZ4vSjspzp5y6YDtDs4VzUSTbWzBFZ/LKJhnyFOKw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><link href="../extra.css" rel="stylesheet">
<meta property="og:title" content="Data analysis with dplyr syntax">
<meta property="og:description" content="Learn how to use the dplyr backend supplied by arrow">
<meta property="og:image" content="https://arrow.apache.org/img/arrow-logo_horizontal_black-txt_white-bg.png">
<meta property="og:image:alt" content="Apache Arrow logo, displaying the triple chevron image adjacent to the text">
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:creator" content="@apachearrow">
<meta name="twitter:site" content="@apachearrow">
<!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]--><!-- Matomo --><script>
/* Matomo command queue: reuse a queue already present on window, otherwise start an empty one. */
window._paq = window._paq || [];
var _paq = window._paq;
/* Tracker methods like "setCustomDimension" should be queued before "trackPageView". */
/* Cookie tracking is explicitly disabled to avoid privacy issues. */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  var baseUrl = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
  _paq.push(['setSiteId', '20']);
  /* Insert an async loader for matomo.js immediately before the first script element of the page. */
  var firstScript = document.getElementsByTagName('script')[0];
  var loader = document.createElement('script');
  loader.async = true;
  loader.src = baseUrl + 'matomo.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script><!-- End Matomo Code -->
</head>
<body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-black"><div class="container">
<a class="navbar-brand me-2" href="../index.html">Arrow R Package</a>
<span class="version">
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">16.0.0.9000</small>
</span>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto">
<li class="nav-item">
<a class="nav-link" href="../articles/arrow.html">Get started</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../reference/index.html">Reference</a>
</li>
<li class="active nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-articles">Articles</a>
<div class="dropdown-menu" aria-labelledby="dropdown-articles">
<h6 class="dropdown-header" data-toc-skip>Using the package</h6>
<a class="dropdown-item" href="../articles/read_write.html">Reading and writing data files</a>
<a class="dropdown-item" href="../articles/data_wrangling.html">Data analysis with dplyr syntax</a>
<a class="dropdown-item" href="../articles/dataset.html">Working with multi-file data sets</a>
<a class="dropdown-item" href="../articles/python.html">Integrating Arrow, Python, and R</a>
<a class="dropdown-item" href="../articles/fs.html">Using cloud storage (S3, GCS)</a>
<a class="dropdown-item" href="../articles/flight.html">Connecting to a Flight server</a>
<div class="dropdown-divider"></div>
<h6 class="dropdown-header" data-toc-skip>Arrow concepts</h6>
<a class="dropdown-item" href="../articles/data_objects.html">Data objects</a>
<a class="dropdown-item" href="../articles/data_types.html">Data types</a>
<a class="dropdown-item" href="../articles/metadata.html">Metadata</a>
<div class="dropdown-divider"></div>
<h6 class="dropdown-header" data-toc-skip>Installation</h6>
<a class="dropdown-item" href="../articles/install.html">Installing on Linux</a>
<a class="dropdown-item" href="../articles/install_nightly.html">Installing development versions</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="../articles/index.html">More articles...</a>
</div>
</li>
<li class="nav-item">
<a class="nav-link" href="../news/index.html">Changelog</a>
</li>
</ul>
<form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off">
</form>
<ul class="navbar-nav">
<li class="nav-item">
<a class="external-link nav-link" href="https://github.com/apache/arrow/" aria-label="github">
<span class="fab fa fab fa-github fa-lg"></span>
</a>
</li>
</ul>
</div>
</div>
</nav><div class="container template-article">
<script src="data_wrangling_files/accessible-code-block-0.0.1/empty-anchor.js"></script><div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>Data analysis with dplyr syntax</h1>
<small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/main/r/vignettes/data_wrangling.Rmd" class="external-link"><code>vignettes/data_wrangling.Rmd</code></a></small>
<div class="d-none name"><code>data_wrangling.Rmd</code></div>
</div>
<p>The arrow package provides functionality allowing users to manipulate tabular Arrow data (<code>Table</code> and <code>Dataset</code> objects) with familiar <a href="https://dplyr.tidyverse.org" class="external-link">dplyr</a> syntax. To enable this functionality, ensure that the arrow and dplyr packages are both loaded. In this article we will take the <code>starwars</code> data set included in dplyr, convert it to an Arrow Table, and then analyze this data. Note that, although these examples all use an in-memory <code>Table</code> object, the same functionality works for an on-disk <code>Dataset</code> object with only minor differences in behavior (documented later in the article).</p>
<p>To get started let’s load the packages and create the data:</p>
<div class="sourceCode" id="cb1"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="kw"><a href="https://rdrr.io/r/base/library.html" class="external-link">library</a></span><span class="op">(</span><span class="va"><a href="https://dplyr.tidyverse.org" class="external-link">dplyr</a></span>, warn.conflicts <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
<span><span class="kw"><a href="https://rdrr.io/r/base/library.html" class="external-link">library</a></span><span class="op">(</span><span class="va"><a href="https://github.com/apache/arrow/" class="external-link">arrow</a></span>, warn.conflicts <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
<span></span>
<span><span class="va">sw</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/table.html">arrow_table</a></span><span class="op">(</span><span class="va">starwars</span>, as_data_frame <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div>
<div class="section level2">
<h2 id="one-table-dplyr-verbs">One-table dplyr verbs<a class="anchor" aria-label="anchor" href="#one-table-dplyr-verbs"></a>
</h2>
<p>The arrow package provides support for the dplyr one-table verbs, allowing users to construct data analysis pipelines in a familiar way. The example below shows the use of <code><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter()</a></code>, <code><a href="https://dplyr.tidyverse.org/reference/rename.html" class="external-link">rename()</a></code>, <code><a href="https://dplyr.tidyverse.org/reference/mutate.html" class="external-link">mutate()</a></code>, <code><a href="https://dplyr.tidyverse.org/reference/arrange.html" class="external-link">arrange()</a></code> and <code><a href="https://dplyr.tidyverse.org/reference/select.html" class="external-link">select()</a></code>:</p>
<div class="sourceCode" id="cb2"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">result</span> <span class="op">&lt;-</span> <span class="va">sw</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">homeworld</span> <span class="op">==</span> <span class="st">"Tatooine"</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/rename.html" class="external-link">rename</a></span><span class="op">(</span>height_cm <span class="op">=</span> <span class="va">height</span>, mass_kg <span class="op">=</span> <span class="va">mass</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/mutate.html" class="external-link">mutate</a></span><span class="op">(</span>height_in <span class="op">=</span> <span class="va">height_cm</span> <span class="op">/</span> <span class="fl">2.54</span>, mass_lbs <span class="op">=</span> <span class="va">mass_kg</span> <span class="op">*</span> <span class="fl">2.2046</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/arrange.html" class="external-link">arrange</a></span><span class="op">(</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/desc.html" class="external-link">desc</a></span><span class="op">(</span><span class="va">birth_year</span><span class="op">)</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/select.html" class="external-link">select</a></span><span class="op">(</span><span class="va">name</span>, <span class="va">height_in</span>, <span class="va">mass_lbs</span><span class="op">)</span></span></code></pre></div>
<p>It is important to note that arrow uses lazy evaluation to delay computation until the result is explicitly requested. This speeds up processing by enabling the Arrow C++ library to perform multiple computations in one operation. As a consequence of this design choice, we have not yet performed computations on the <code>sw</code> data. The <code>result</code> variable is an object with class <code>arrow_dplyr_query</code> that represents all the computations to be performed:</p>
<div class="sourceCode" id="cb3"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">result</span></span></code></pre></div>
<pre><code><span><span class="co">## Table (query)</span></span>
<span><span class="co">## name: string</span></span>
<span><span class="co">## height_in: double (divide(cast(height, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), cast(2.54, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false})))</span></span>
<span><span class="co">## mass_lbs: double (multiply_checked(mass, 2.2046))</span></span>
<span><span class="co">## </span></span>
<span><span class="co">## * Filter: (homeworld == "Tatooine")</span></span>
<span><span class="co">## * Sorted by birth_year [desc]</span></span>
<span><span class="co">## See $.data for the source Arrow object</span></span></code></pre>
<p>To perform these computations and materialize the result, we call <code><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">compute()</a></code> or <code><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect()</a></code>. The two functions differ in the kind of object they return. Calling <code><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">compute()</a></code> returns an Arrow Table, suitable for passing to other arrow or dplyr functions:</p>
<div class="sourceCode" id="cb5"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">compute</a></span><span class="op">(</span><span class="va">result</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## Table</span></span>
<span><span class="co">## 10 rows x 3 columns</span></span>
<span><span class="co">## $name &lt;string&gt;</span></span>
<span><span class="co">## $height_in &lt;double&gt;</span></span>
<span><span class="co">## $mass_lbs &lt;double&gt;</span></span></code></pre>
<p>In contrast, <code><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect()</a></code> returns an R data frame, suitable for viewing or passing to other R functions for analysis or visualization:</p>
<div class="sourceCode" id="cb7"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect</a></span><span class="op">(</span><span class="va">result</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 10 x 3</span></span></span>
<span><span class="co">## name height_in mass_lbs</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> C-3PO 65.7 165. </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> Cliegg Lars 72.0 <span style="color: #BB0000;">NA</span> </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> Shmi Skywalker 64.2 <span style="color: #BB0000;">NA</span> </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Owen Lars 70.1 265. </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Beru Whitesun Lars 65.0 165. </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Darth Vader 79.5 300. </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Anakin Skywalker 74.0 185. </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> Biggs Darklighter 72.0 185. </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Luke Skywalker 67.7 170. </span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> R5-D4 38.2 70.5</span></span></code></pre>
<p>The arrow package has broad support for single-table dplyr verbs, including those that compute aggregates. For example, it supports <code><a href="https://dplyr.tidyverse.org/reference/group_by.html" class="external-link">group_by()</a></code> and <code><a href="https://dplyr.tidyverse.org/reference/summarise.html" class="external-link">summarize()</a></code>, as well as commonly-used convenience functions such as <code><a href="https://dplyr.tidyverse.org/reference/count.html" class="external-link">count()</a></code>:</p>
<div class="sourceCode" id="cb9"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">sw</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/group_by.html" class="external-link">group_by</a></span><span class="op">(</span><span class="va">species</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/summarise.html" class="external-link">summarize</a></span><span class="op">(</span>mean_height <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/mean.html" class="external-link">mean</a></span><span class="op">(</span><span class="va">height</span>, na.rm <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect</a></span><span class="op">(</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 38 x 2</span></span></span>
<span><span class="co">## species mean_height</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Human 178 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> Droid 131.</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> Wookiee 231 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Rodian 173 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Hutt 175 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> <span style="color: #BB0000;">NA</span> 175 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Yoda's species 66 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> Trandoshan 190 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Mon Calamari 180 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Ewok 88 </span></span>
<span><span class="co">## <span style="color: #949494;"># i 28 more rows</span></span></span></code></pre>
<div class="sourceCode" id="cb11"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">sw</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/count.html" class="external-link">count</a></span><span class="op">(</span><span class="va">gender</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect</a></span><span class="op">(</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 3 x 2</span></span></span>
<span><span class="co">## gender n</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;">1</span> masculine 66</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">2</span> feminine 17</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">3</span> <span style="color: #BB0000;">NA</span> 4</span></span></code></pre>
<p>Note, however, that window functions such as <code><a href="https://dplyr.tidyverse.org/reference/ntile.html" class="external-link">ntile()</a></code> are not yet supported.</p>
</div>
<div class="section level2">
<h2 id="two-table-dplyr-verbs">Two-table dplyr verbs<a class="anchor" aria-label="anchor" href="#two-table-dplyr-verbs"></a>
</h2>
<p>Equality joins (e.g. <code><a href="https://dplyr.tidyverse.org/reference/mutate-joins.html" class="external-link">left_join()</a></code>, <code><a href="https://dplyr.tidyverse.org/reference/mutate-joins.html" class="external-link">inner_join()</a></code>) are supported for joining multiple tables. This is illustrated below:</p>
<div class="sourceCode" id="cb13"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">jedi</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span></span>
<span> name <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"C-3PO"</span>, <span class="st">"Luke Skywalker"</span>, <span class="st">"Obi-Wan Kenobi"</span><span class="op">)</span>,</span>
<span> jedi <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="cn">FALSE</span>, <span class="cn">TRUE</span>, <span class="cn">TRUE</span><span class="op">)</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="va">sw</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/select.html" class="external-link">select</a></span><span class="op">(</span><span class="fl">1</span><span class="op">:</span><span class="fl">3</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/mutate-joins.html" class="external-link">right_join</a></span><span class="op">(</span><span class="va">jedi</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect</a></span><span class="op">(</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 3 x 4</span></span></span>
<span><span class="co">## name height mass jedi </span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;lgl&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;">1</span> Luke Skywalker 172 77 TRUE </span></span>
<span><span class="co">## <span style="color: #BCBCBC;">2</span> C-3PO 167 75 FALSE</span></span>
<span><span class="co">## <span style="color: #BCBCBC;">3</span> Obi-Wan Kenobi 182 77 TRUE</span></span></code></pre>
</div>
<div class="section level2">
<h2 id="expressions-within-dplyr-verbs">Expressions within dplyr verbs<a class="anchor" aria-label="anchor" href="#expressions-within-dplyr-verbs"></a>
</h2>
<p>Inside dplyr verbs, Arrow offers support for many functions and operators, with common functions mapped to their base R and tidyverse equivalents: you can find a <a href="../reference/acero.html">list of supported functions within dplyr queries</a> in the function documentation. If there are additional functions you would like to see implemented, please file an issue as described in the <a href="https://arrow.apache.org/docs/r/#getting-help">Getting help</a> guidelines.</p>
</div>
<div class="section level2">
<h2 id="registering-custom-bindings">Registering custom bindings<a class="anchor" aria-label="anchor" href="#registering-custom-bindings"></a>
</h2>
<p>The arrow package makes it possible for users to supply bindings for custom functions in some situations using <code><a href="../reference/register_scalar_function.html">register_scalar_function()</a></code>. To operate correctly, the to-be-registered function must have <code>context</code> as its first argument, as required by the query engine. For example, suppose we wanted to implement a function that converts a string to snake case (a greatly simplified version of <code>janitor::make_clean_names()</code>). The function could be written as follows:</p>
<div class="sourceCode" id="cb15"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">to_snake_name</span> <span class="op">&lt;-</span> <span class="kw">function</span><span class="op">(</span><span class="va">context</span>, <span class="va">string</span><span class="op">)</span> <span class="op">{</span></span>
<span> <span class="va">replace</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span>`'` <span class="op">=</span> <span class="st">""</span>, `"` <span class="op">=</span> <span class="st">""</span>, `-` <span class="op">=</span> <span class="st">""</span>, `\\.` <span class="op">=</span> <span class="st">"_"</span>, ` ` <span class="op">=</span> <span class="st">"_"</span><span class="op">)</span></span>
<span> <span class="va">string</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu">stringr</span><span class="fu">::</span><span class="fu"><a href="https://stringr.tidyverse.org/reference/str_replace.html" class="external-link">str_replace_all</a></span><span class="op">(</span><span class="va">replace</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu">stringr</span><span class="fu">::</span><span class="fu"><a href="https://stringr.tidyverse.org/reference/case.html" class="external-link">str_to_lower</a></span><span class="op">(</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu">stringi</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/stringi/man/stri_trans_general.html" class="external-link">stri_trans_general</a></span><span class="op">(</span>id <span class="op">=</span> <span class="st">"Latin-ASCII"</span><span class="op">)</span></span>
<span><span class="op">}</span></span></code></pre></div>
<p>To call this within an arrow/dplyr pipeline, it needs to be registered:</p>
<div class="sourceCode" id="cb16"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="../reference/register_scalar_function.html">register_scalar_function</a></span><span class="op">(</span></span>
<span> name <span class="op">=</span> <span class="st">"to_snake_name"</span>,</span>
<span> fun <span class="op">=</span> <span class="va">to_snake_name</span>,</span>
<span> in_type <span class="op">=</span> <span class="fu"><a href="../reference/data-type.html">utf8</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> out_type <span class="op">=</span> <span class="fu"><a href="../reference/data-type.html">utf8</a></span><span class="op">(</span><span class="op">)</span>,</span>
<span> auto_convert <span class="op">=</span> <span class="cn">TRUE</span></span>
<span><span class="op">)</span></span></code></pre></div>
<p>In this call, the <code>name</code> argument specifies the name by which the function will be recognized within an arrow/dplyr pipeline, and <code>fun</code> is the function itself. The <code>in_type</code> and <code>out_type</code> arguments specify the expected data types of the input and output, and <code>auto_convert</code> specifies whether arrow should automatically convert any R inputs to their Arrow equivalents.</p>
<p>Once registered, the following works:</p>
<div class="sourceCode" id="cb17"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">sw</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/mutate.html" class="external-link">mutate</a></span><span class="op">(</span><span class="va">name</span>, snake_name <span class="op">=</span> <span class="fu">to_snake_name</span><span class="op">(</span><span class="va">name</span><span class="op">)</span>, .keep <span class="op">=</span> <span class="st">"none"</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect</a></span><span class="op">(</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 87 x 2</span></span></span>
<span><span class="co">## name snake_name </span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Luke Skywalker luke_skywalker </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> C-3PO c3po </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> R2-D2 r2d2 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Darth Vader darth_vader </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Leia Organa leia_organa </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Owen Lars owen_lars </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Beru Whitesun Lars beru_whitesun_lars</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> R5-D4 r5d4 </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Biggs Darklighter biggs_darklighter </span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Obi-Wan Kenobi obiwan_kenobi </span></span>
<span><span class="co">## <span style="color: #949494;"># i 77 more rows</span></span></span></code></pre>
<p>To learn more, see <code><a href="../reference/register_scalar_function.html">help("register_scalar_function", package = "arrow")</a></code>.</p>
</div>
<div class="section level2">
<h2 id="handling-unsupported-expressions">Handling unsupported expressions<a class="anchor" aria-label="anchor" href="#handling-unsupported-expressions"></a>
</h2>
<p>Table objects are held in memory and should usually be representable as data frames. For dplyr queries on Table objects, if the arrow package detects an unimplemented function within a dplyr verb, it automatically calls <code><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect()</a></code> to return the data as an R data frame before processing that dplyr verb. As an example, neither <code><a href="https://rdrr.io/r/stats/lm.html" class="external-link">lm()</a></code> nor <code><a href="https://rdrr.io/r/stats/residuals.html" class="external-link">residuals()</a></code> is implemented, so if we write code that computes the residuals for a linear regression model, this automatic collection takes place:</p>
<div class="sourceCode" id="cb19"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">sw</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/NA.html" class="external-link">is.na</a></span><span class="op">(</span><span class="va">height</span><span class="op">)</span>, <span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/NA.html" class="external-link">is.na</a></span><span class="op">(</span><span class="va">mass</span><span class="op">)</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/transmute.html" class="external-link">transmute</a></span><span class="op">(</span><span class="va">name</span>, <span class="va">height</span>, <span class="va">mass</span>, res <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/stats/residuals.html" class="external-link">residuals</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/stats/lm.html" class="external-link">lm</a></span><span class="op">(</span><span class="va">mass</span> <span class="op">~</span> <span class="va">height</span><span class="op">)</span><span class="op">)</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## Warning: Expression residuals(lm(mass ~ height)) not supported in Arrow;</span></span>
<span><span class="co">## pulling data into R</span></span></code></pre>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 59 x 4</span></span></span>
<span><span class="co">## name height mass res</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Luke Skywalker 172 77 -<span style="color: #BB0000;">18.8</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> C-3PO 167 75 -<span style="color: #BB0000;">17.7</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> R2-D2 96 32 -<span style="color: #BB0000;">16.4</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Darth Vader 202 136 21.4</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Leia Organa 150 49 -<span style="color: #BB0000;">33.1</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Owen Lars 178 120 20.4</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Beru Whitesun Lars 165 75 -<span style="color: #BB0000;">16.5</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> R5-D4 97 32 -<span style="color: #BB0000;">17.0</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Biggs Darklighter 183 84 -<span style="color: #BB0000;">18.7</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Obi-Wan Kenobi 182 77 -<span style="color: #BB0000;">25.1</span></span></span>
<span><span class="co">## <span style="color: #949494;"># i 49 more rows</span></span></span></code></pre>
<p>For queries on <code>Dataset</code> objects – which can be larger than memory – arrow is more conservative and always raises an error if it detects an unsupported expression. To illustrate this behavior, we can write the <code>starwars</code> data to disk and then open it as a Dataset. When we use the same pipeline on the Dataset, we obtain an error:</p>
<div class="sourceCode" id="cb22"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># write and open starwars dataset</span></span>
<span><span class="va">dataset_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span>
<span><span class="fu"><a href="../reference/write_dataset.html">write_dataset</a></span><span class="op">(</span><span class="va">starwars</span>, <span class="va">dataset_path</span><span class="op">)</span></span>
<span><span class="va">sw2</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/open_dataset.html">open_dataset</a></span><span class="op">(</span><span class="va">dataset_path</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># dplyr pipeline with unsupported expressions</span></span>
<span><span class="va">sw2</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/NA.html" class="external-link">is.na</a></span><span class="op">(</span><span class="va">height</span><span class="op">)</span>, <span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/NA.html" class="external-link">is.na</a></span><span class="op">(</span><span class="va">mass</span><span class="op">)</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/transmute.html" class="external-link">transmute</a></span><span class="op">(</span><span class="va">name</span>, <span class="va">height</span>, <span class="va">mass</span>, res <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/stats/residuals.html" class="external-link">residuals</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/stats/lm.html" class="external-link">lm</a></span><span class="op">(</span><span class="va">mass</span> <span class="op">~</span> <span class="va">height</span><span class="op">)</span><span class="op">)</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## Error: Expression residuals(lm(mass ~ height)) not supported in Arrow</span></span>
<span><span class="co">## Call collect() first to pull data into R.</span></span></code></pre>
<p>Calling <code><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect()</a></code> in the middle of the pipeline fixes the issue:</p>
<div class="sourceCode" id="cb24"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">sw2</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/NA.html" class="external-link">is.na</a></span><span class="op">(</span><span class="va">height</span><span class="op">)</span>, <span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/NA.html" class="external-link">is.na</a></span><span class="op">(</span><span class="va">mass</span><span class="op">)</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect</a></span><span class="op">(</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/transmute.html" class="external-link">transmute</a></span><span class="op">(</span><span class="va">name</span>, <span class="va">height</span>, <span class="va">mass</span>, res <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/stats/residuals.html" class="external-link">residuals</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/stats/lm.html" class="external-link">lm</a></span><span class="op">(</span><span class="va">mass</span> <span class="op">~</span> <span class="va">height</span><span class="op">)</span><span class="op">)</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 59 x 4</span></span></span>
<span><span class="co">## name height mass res</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Luke Skywalker 172 77 -<span style="color: #BB0000;">18.8</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> C-3PO 167 75 -<span style="color: #BB0000;">17.7</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> R2-D2 96 32 -<span style="color: #BB0000;">16.4</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Darth Vader 202 136 21.4</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Leia Organa 150 49 -<span style="color: #BB0000;">33.1</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Owen Lars 178 120 20.4</span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> Beru Whitesun Lars 165 75 -<span style="color: #BB0000;">16.5</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> R5-D4 97 32 -<span style="color: #BB0000;">17.0</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Biggs Darklighter 183 84 -<span style="color: #BB0000;">18.7</span></span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Obi-Wan Kenobi 182 77 -<span style="color: #BB0000;">25.1</span></span></span>
<span><span class="co">## <span style="color: #949494;"># i 49 more rows</span></span></span></code></pre>
<p>For some operations, you can use <a href="https://www.duckdb.org" class="external-link">DuckDB</a>. It supports Arrow natively, so you can pass the <code>Dataset</code> or query object to DuckDB with the helper function <code><a href="../reference/to_duckdb.html">to_duckdb()</a></code> without paying a performance penalty, and then pass the object back to Arrow with <code><a href="../reference/to_arrow.html">to_arrow()</a></code>:</p>
<div class="sourceCode" id="cb26"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">sw</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/select.html" class="external-link">select</a></span><span class="op">(</span><span class="fl">1</span><span class="op">:</span><span class="fl">4</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/NA.html" class="external-link">is.na</a></span><span class="op">(</span><span class="va">hair_color</span><span class="op">)</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="../reference/to_duckdb.html">to_duckdb</a></span><span class="op">(</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/group_by.html" class="external-link">group_by</a></span><span class="op">(</span><span class="va">hair_color</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">height</span> <span class="op">&lt;</span> <span class="fu"><a href="https://rdrr.io/r/base/mean.html" class="external-link">mean</a></span><span class="op">(</span><span class="va">height</span>, na.rm <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="fu"><a href="../reference/to_arrow.html">to_arrow</a></span><span class="op">(</span><span class="op">)</span> <span class="op"><a href="https://magrittr.tidyverse.org/reference/pipe.html" class="external-link">%&gt;%</a></span></span>
<span> <span class="co"># perform other arrow operations...</span></span>
<span> <span class="fu"><a href="https://dplyr.tidyverse.org/reference/compute.html" class="external-link">collect</a></span><span class="op">(</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## <span style="color: #949494;"># A tibble: 28 x 4</span></span></span>
<span><span class="co">## name height mass hair_color</span></span>
<span><span class="co">## <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> <span style="color: #949494; font-style: italic;">&lt;int&gt;</span> <span style="color: #949494; font-style: italic;">&lt;dbl&gt;</span> <span style="color: #949494; font-style: italic;">&lt;chr&gt;</span> </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 1</span> Yoda 66 17 white </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 2</span> Watto 137 <span style="color: #BB0000;">NA</span> black </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 3</span> Shmi Skywalker 163 <span style="color: #BB0000;">NA</span> black </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 4</span> Eeth Koth 171 <span style="color: #BB0000;">NA</span> black </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 5</span> Luminara Unduli 170 56.2 black </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 6</span> Barriss Offee 166 50 black </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 7</span> R4-P17 96 <span style="color: #BB0000;">NA</span> none </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 8</span> Lobot 175 79 none </span></span>
<span><span class="co">## <span style="color: #BCBCBC;"> 9</span> Ackbar 180 83 none </span></span>
<span><span class="co">## <span style="color: #BCBCBC;">10</span> Nien Nunb 160 68 none </span></span>
<span><span class="co">## <span style="color: #949494;"># i 18 more rows</span></span></span></code></pre>
</div>
<div class="section level2">
<h2 id="further-reading">Further reading<a class="anchor" aria-label="anchor" href="#further-reading"></a>
</h2>
<ul>
<li>To learn more about multi-file datasets, see the <a href="./dataset.html">dataset article</a>.</li>
<li>To learn more about user-registered functions, see <code><a href="../reference/register_scalar_function.html">help("register_scalar_function", package = "arrow")</a></code>.</li>
<li>To learn more about writing dplyr bindings as an arrow developer, see the <a href="./developers/writing_bindings.html">article on writing bindings</a>.</li>
</ul>
</div>
</main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2>
</nav></aside>
</div>
<footer><div class="pkgdown-footer-left">
<p><a href="https://arrow.apache.org/docs/r/versions.html">Older versions of these docs</a></p>
</div>
<div class="pkgdown-footer-right">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.9.</p>
</div>
</footer>
</div>
</body>
</html>