blob: 3a06b2c340fd500985e382afe07b0163d56c5cda [file] [log] [blame]
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en-US">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>Internal structure of Arrow objects • Arrow R Package</title>
<!-- favicons --><link rel="icon" type="image/png" sizes="96x96" href="../../favicon-96x96.png">
<link rel="icon" type="image/svg+xml" href="../../favicon.svg">
<link rel="apple-touch-icon" sizes="180x180" href="../../apple-touch-icon.png">
<link rel="icon" sizes="any" href="../../favicon.ico">
<link rel="manifest" href="../../site.webmanifest">
<script src="../../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link href="../../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet">
<script src="../../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><link href="../../deps/font-awesome-6.5.2/css/all.min.css" rel="stylesheet">
<link href="../../deps/font-awesome-6.5.2/css/v4-shims.min.css" rel="stylesheet">
<script src="../../deps/headroom-0.11.0/headroom.min.js"></script><script src="../../deps/headroom-0.11.0/jQuery.headroom.min.js"></script><script src="../../deps/bootstrap-toc-1.0.1/bootstrap-toc.min.js"></script><script src="../../deps/clipboard.js-2.0.11/clipboard.min.js"></script><script src="../../deps/search-1.0.0/autocomplete.jquery.min.js"></script><script src="../../deps/search-1.0.0/fuse.min.js"></script><script src="../../deps/search-1.0.0/mark.min.js"></script><!-- pkgdown --><script src="../../pkgdown.js"></script><link href="../../extra.css" rel="stylesheet">
<meta property="og:title" content="Internal structure of Arrow objects">
<meta name="description" content="Learn about the internal structure of Arrow data objects.
">
<meta property="og:description" content="Learn about the internal structure of Arrow data objects.
">
<meta property="og:image" content="https://arrow.apache.org/img/arrow-logo_horizontal_black-txt_white-bg.png">
<meta property="og:image:alt" content="Apache Arrow logo, displaying the triple chevron image adjacent to the text">
<!-- Matomo --><script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script><!-- End Matomo Code --><!-- Kapa AI --><script async src="https://widget.kapa.ai/kapa-widget.bundle.js" data-website-id="9db461d5-ac77-4b3f-a5c5-75efa78339d2" data-project-name="Apache Arrow" data-project-color="#000000" data-project-logo="https://arrow.apache.org/img/arrow-logo_chevrons_white-txt_black-bg.png" data-modal-disclaimer="This is a custom LLM with access to all of [Arrow documentation](https://arrow.apache.org/docs/). If you want an R-specific answer, please mention this in your question." data-consent-required="true" data-user-analytics-cookie-enabled="false" data-consent-screen-disclaimer="By clicking &quot;I agree, let's chat&quot;, you consent to the use of the AI assistant in accordance with kapa.ai's [Privacy Policy](https://www.kapa.ai/content/privacy-policy). This service uses reCAPTCHA, which requires your consent to Google's [Privacy Policy](https://policies.google.com/privacy) and [Terms of Service](https://policies.google.com/terms). By proceeding, you explicitly agree to both kapa.ai's and Google's privacy policies."></script><!-- End Kapa AI -->
</head>
<body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-black"><div class="container">
<a class="navbar-brand me-2" href="../../index.html">Arrow R Package</a>
<span class="version">
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">22.0.0.9000</small>
</span>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a class="nav-link" href="../../articles/arrow.html">Get started</a></li>
<li class="nav-item"><a class="nav-link" href="../../reference/index.html">Reference</a></li>
<li class="active nav-item dropdown">
<button class="nav-link dropdown-toggle" type="button" id="dropdown-articles" data-bs-toggle="dropdown" aria-expanded="false" aria-haspopup="true">Articles</button>
<ul class="dropdown-menu" aria-labelledby="dropdown-articles">
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Using the package</h6></li>
<li><a class="dropdown-item" href="../../articles/read_write.html">Reading and writing data files</a></li>
<li><a class="dropdown-item" href="../../articles/data_wrangling.html">Data analysis with dplyr syntax</a></li>
<li><a class="dropdown-item" href="../../articles/dataset.html">Working with multi-file data sets</a></li>
<li><a class="dropdown-item" href="../../articles/python.html">Integrating Arrow, Python, and R</a></li>
<li><a class="dropdown-item" href="../../articles/fs.html">Using cloud storage (S3, GCS)</a></li>
<li><a class="dropdown-item" href="../../articles/flight.html">Connecting to a Flight server</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Arrow concepts</h6></li>
<li><a class="dropdown-item" href="../../articles/data_objects.html">Data objects</a></li>
<li><a class="dropdown-item" href="../../articles/data_types.html">Data types</a></li>
<li><a class="dropdown-item" href="../../articles/metadata.html">Metadata</a></li>
<li><hr class="dropdown-divider"></li>
<li><h6 class="dropdown-header" data-toc-skip>Installation</h6></li>
<li><a class="dropdown-item" href="../../articles/install.html">Installing on Linux</a></li>
<li><a class="dropdown-item" href="../../articles/install_nightly.html">Installing development versions</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="../../articles/index.html">More articles...</a></li>
</ul>
</li>
<li class="nav-item"><a class="nav-link" href="../../news/index.html">Changelog</a></li>
</ul>
<form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search" name="search-input" data-search-index="../../search.json" id="search-input" placeholder="" autocomplete="off">
</form>
<ul class="navbar-nav">
<li class="nav-item"><a class="external-link nav-link" href="https://github.com/apache/arrow/" aria-label="GitHub"><span class="fa fab fa-github fa-lg"></span></a></li>
</ul>
</div>
</div>
</nav><div class="container template-article">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>Internal structure of Arrow objects</h1>
<small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/main/r/vignettes/developers/data_object_layout.Rmd" class="external-link"><code>vignettes/developers/data_object_layout.Rmd</code></a></small>
<div class="d-none name"><code>data_object_layout.Rmd</code></div>
</div>
<p>This article describes the internal structure of Arrow data objects.
Users of the arrow R package will not generally need to understand the
internal structure of Arrow data objects. We include it here to help
orient those R users and Arrow developers who wish to understand the <a href="https://arrow.apache.org/docs/format/Columnar.html" class="external-link">Arrow
specification</a>. This article provides a deeper dive into some of the
topics described in the <a href="../data_objects.html">data objects
article</a>, and is intended mostly for developers. It is not necessary
knowledge for using the arrow package.</p>
<p>We begin by describing two key concepts:</p>
<ul>
<li>Values in an array are stored in one or more
<strong>buffers</strong>. A buffer is a sequential virtual address space
(i.e., block of memory) with a given length. Given a pointer specifying
the memory address where the buffer starts, you can reach any byte in
the buffer with an “offset” value that specifies a location relative to
the start of the buffer.</li>
<li>The <strong>physical layout</strong> of an array is a term used to
describe how data in an array is laid out in memory, without taking into
account how that information is interpreted. As an example: a 32-bit
signed integer and 32-bit floating point number have the same layout:
they are both 32 bits, represented as 4 contiguous bytes in memory. The
meaning is different, but the layout is the same.</li>
</ul>
<p>We can unpack these ideas using a simple array of integer values:</p>
<div class="sourceCode" id="cb1"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">integer_array</span> <span class="op">&lt;-</span> <span class="va">Array</span><span class="op">$</span><span class="fu">create</span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1L</span>, <span class="cn">NA</span>, <span class="fl">2L</span>, <span class="fl">4L</span>, <span class="fl">8L</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="va">integer_array</span></span></code></pre></div>
<pre><code><span><span class="co">## Array</span></span>
<span><span class="co">## &lt;int32&gt;</span></span>
<span><span class="co">## [</span></span>
<span><span class="co">## 1,</span></span>
<span><span class="co">## null,</span></span>
<span><span class="co">## 2,</span></span>
<span><span class="co">## 4,</span></span>
<span><span class="co">## 8</span></span>
<span><span class="co">## ]</span></span></code></pre>
<p>We can inspect the <code>integer_array$type</code> attribute to see
that the values in the Array are stored as signed 32-bit integers. When
laid out in memory by the Arrow C++ library, an integer array consists
of two pieces of metadata and two buffers that store the data. The
metadata specify the length of the array and a count of the number of
null values, both stored as 64-bit integers. These metadata can be
viewed from R using <code>integer_array$length()</code> and
<code>integer_array$null_count</code> respectively. The number of
buffers associated with an array depends on the exact type of data being
stored. For an integer array there are two: a “validity bitmap buffer”
and a “data value buffer”. Schematically we could depict the array as
follows:</p>
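<p><img src="array_layout_integer.png" width="100%" alt="Schematic of the integer array: metadata (length and null count) alongside two buffers, the validity bitmap buffer and the data value buffer."></p>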
<p>This image shows the array as a rectangle subdivided into two parts,
one for the metadata and the other for the buffers. Underneath the
rectangle we’ve unpacked the contents of the buffers for you, showing
the contents of the two buffers in the area enclosed in a dotted line.
At the very bottom of the figure, you can see the contents of specific
bytes.</p>
<div class="section level2">
<h2 id="validity-bitmap-buffer">Validity bitmap buffer<a class="anchor" aria-label="anchor" href="#validity-bitmap-buffer"></a>
</h2>
<p>The validity bitmap is binary-valued, and contains a 1 whenever the
corresponding slot in the array contains a valid, non-null value. At an
abstract level we can assume this contains the following five bits:</p>
<pre><code><span><span class="fl">10111</span></span></code></pre>
<p>However this is a slight over-simplification for three reasons.
First, because memory is allocated in byte-size units there are three
trailing bits at the end (assumed to be zero), giving us the bitmap
<code>10111000</code>. Second, while we have written this from
left-to-right, this written format is typically presumed to represent <a href="https://en.wikipedia.org/wiki/Endianness" class="external-link">big endian format</a>
where the most-significant bit is written first (i.e., to the
lowest-valued memory address). Arrow adopts a little-endian convention,
which would more naturally correspond to a right-to-left ordering when
written in English. To reflect this we write the bits in right-to-left
order: <code>00011101</code>. Finally, Arrow encourages <a href="https://en.wikipedia.org/wiki/Data_structure_alignment" class="external-link">naturally
aligned data structures</a> in which allocated memory addresses are a
multiple of the data block sizes. Arrow uses <em>64 byte alignment</em>,
so each data structure must be a multiple of 64 bytes in size. This
design feature exists to allow efficient use of modern hardware, as
discussed in the <a href="https://arrow.apache.org/docs/format/Columnar.html#buffer-alignment-and-padding" class="external-link">Arrow
specification</a>. This is what the buffer looks like in
memory:</p>
<div class="grid">
<div class="g-col-6">
<table class="table">
<thead><tr class="header">
<th>Byte 0 (validity bitmap)</th>
<th>Bytes 1-63</th>
</tr></thead>
<tbody><tr class="odd">
<td><code>00011101</code></td>
<td>
<code>0</code> (padding)</td>
</tr></tbody>
</table>
</div>
</div>
</div>
<div class="section level2">
<h2 id="data-buffer">Data buffer<a class="anchor" aria-label="anchor" href="#data-buffer"></a>
</h2>
<p>The data buffer, like the validity bitmap, is padded out to a length
of 64 bytes to preserve natural alignment. Here’s the diagram showing
the physical layout:</p>
<div class="grid">
<div class="g-col-12">
<table class="table">
<colgroup>
<col width="14%">
<col width="17%">
<col width="16%">
<col width="17%">
<col width="17%">
<col width="17%">
</colgroup>
<thead><tr class="header">
<th>Bytes 0-3</th>
<th>Bytes 4-7</th>
<th>Bytes 8-11</th>
<th>Bytes 12-15</th>
<th>Bytes 16-19</th>
<th>Bytes 20-63</th>
</tr></thead>
<tbody><tr class="odd">
<td><code>1</code></td>
<td>unspecified</td>
<td><code>2</code></td>
<td><code>4</code></td>
<td><code>8</code></td>
<td>unspecified</td>
</tr></tbody>
</table>
</div>
</div>
<p>Each integer occupies 4 bytes, as per the requirements of a 32-bit
signed integer. Notice that the bytes associated with the missing value
are left unspecified: space is allocated for the value but those bytes
are not filled.</p>
</div>
<div class="section level2">
<h2 id="offset-buffer">Offset buffer<a class="anchor" aria-label="anchor" href="#offset-buffer"></a>
</h2>
<p>Some types of Arrow array include a third buffer known as the offset
buffer. This is most frequently encountered in the context of string
arrays, such as this one:</p>
<div class="sourceCode" id="cb4"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">string_array</span> <span class="op">&lt;-</span> <span class="va">Array</span><span class="op">$</span><span class="fu">create</span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"hello"</span>, <span class="st">"amazing"</span>, <span class="st">"and"</span>, <span class="st">"cruel"</span>, <span class="st">"world"</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="va">string_array</span></span></code></pre></div>
<pre><code><span><span class="co">## Array</span></span>
<span><span class="co">## &lt;string&gt;</span></span>
<span><span class="co">## [</span></span>
<span><span class="co">## "hello",</span></span>
<span><span class="co">## "amazing",</span></span>
<span><span class="co">## "and",</span></span>
<span><span class="co">## "cruel",</span></span>
<span><span class="co">## "world"</span></span>
<span><span class="co">## ]</span></span></code></pre>
<p>Using the same schematic notation as before, this is the structure of
the object. It has the same metadata as before but as shown below, there
are now three buffers:</p>
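<p><img src="array_layout_string.png" width="100%" alt="Schematic of the string array: the same metadata as before and three buffers, the validity bitmap, offset, and data buffers."></p>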
<p>To understand the role of the offset buffer, it helps to note the
format of the data buffer for a string array: it concatenates all
strings end to end in one contiguous section of memory. For the
<code>string_array</code> object, the contents of the data buffer would
look like one long utf8-encoded string:</p>
<pre><code><span><span class="va">helloamazingandcruelworld</span></span></code></pre>
<p>Because individual strings can be of variable length, the role of the
offset buffer is to specify where the boundaries between the slots are.
The second slot in our array is the string <code>"amazing"</code>. If
the positions in the data array are indexed like this</p>
<table style="width:100%;" class="table">
<colgroup>
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="6%">
<col width="9%">
</colgroup>
<thead><tr class="header">
<th>h</th>
<th>e</th>
<th>l</th>
<th>l</th>
<th>o</th>
<th>a</th>
<th>m</th>
<th>a</th>
<th>z</th>
<th>i</th>
<th>n</th>
<th>g</th>
<th>a</th>
<th>n</th>
<th>d</th>
<th></th>
</tr></thead>
<tbody><tr class="odd">
<td>0</td>
<td>1</td>
<td>2</td>
<td>3</td>
<td>4</td>
<td>5</td>
<td>6</td>
<td>7</td>
<td>8</td>
<td>9</td>
<td>10</td>
<td>11</td>
<td>12</td>
<td>13</td>
<td>14</td>
<td></td>
</tr></tbody>
</table>
<p>then we can see that the string of interest begins at position 5 and
ends at position 11. The offset buffer consists of integers that store
these break point locations. For <code>string_array</code> it might look
like this:</p>
<pre><code>0 5 12 15 20 25</code></pre>
<p>The difference between the <code><a href="../../reference/data-type.html">utf8()</a></code> data type and the
<code><a href="../../reference/data-type.html">large_utf8()</a></code> data type is that the
<code><a href="../../reference/data-type.html">utf8()</a></code> data type stores these offsets as 32-bit integers whereas
the <code><a href="../../reference/data-type.html">large_utf8()</a></code> type stores them as 64-bit integers.</p>
</div>
<div class="section level2">
<h2 id="chunked-arrays">Chunked arrays<a class="anchor" aria-label="anchor" href="#chunked-arrays"></a>
</h2>
<p>Arrays are immutable objects: once an Array has been initialized the
values it stores cannot be altered. This ensures that multiple entities
can safely refer to an Array via pointers, and not run the risk that the
values will change. Using immutable Arrays makes it possible for Arrow
to avoid unnecessary copies of data objects.</p>
<p>There are limitations to immutable Arrays, most notably when new
batches of data arrive. Because an array is immutable, you can’t add the
new information to an existing array. The only thing you can do if you
don’t want to disturb or copy your existing array is create a new array
that contains the new data. Doing that preserves the immutability of
arrays and doesn’t lead to any unnecessary copying but now we have a new
problem: the data are split across two arrays. Each array contains only
one “chunk” of the data. What would be ideal is an abstraction layer
that allows us to treat these two Arrays as though they were a single
“Array-like” object.</p>
<p>This is the problem that chunked arrays solve. A chunked array is a
wrapper around a list of arrays, and allows you to index their contents
“as if” they were a single array. Physically, the data are still stored
in separate places – each array is one chunk, and these chunks don’t
have to be adjacent to each other in memory – but the chunked array
provides us with a layer of abstraction that allows us to pretend that
they are all one thing.</p>
<p>To illustrate, let’s use the <code><a href="../../reference/chunked_array.html">chunked_array()</a></code>
function:</p>
<div class="sourceCode" id="cb8"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">chunked_string_array</span> <span class="op">&lt;-</span> <span class="fu"><a href="../../reference/chunked_array.html">chunked_array</a></span><span class="op">(</span></span>
<span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"hello"</span>, <span class="st">"amazing"</span>, <span class="st">"and"</span>, <span class="st">"cruel"</span>, <span class="st">"world"</span><span class="op">)</span>,</span>
<span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"I"</span>, <span class="st">"love"</span>, <span class="st">"you"</span><span class="op">)</span></span>
<span><span class="op">)</span></span></code></pre></div>
<p>The <code><a href="../../reference/chunked_array.html">chunked_array()</a></code> function is just a wrapper around
the functionality that <code>ChunkedArray$create()</code> provides.
Let’s take a look at the object:</p>
<div class="sourceCode" id="cb9"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">chunked_string_array</span></span></code></pre></div>
<pre><code><span><span class="co">## ChunkedArray</span></span>
<span><span class="co">## &lt;string&gt;</span></span>
<span><span class="co">## [</span></span>
<span><span class="co">## [</span></span>
<span><span class="co">## "hello",</span></span>
<span><span class="co">## "amazing",</span></span>
<span><span class="co">## "and",</span></span>
<span><span class="co">## "cruel",</span></span>
<span><span class="co">## "world"</span></span>
<span><span class="co">## ],</span></span>
<span><span class="co">## [</span></span>
<span><span class="co">## "I",</span></span>
<span><span class="co">## "love",</span></span>
<span><span class="co">## "you"</span></span>
<span><span class="co">## ]</span></span>
<span><span class="co">## ]</span></span></code></pre>
<p>The double bracketing in this output is intended to highlight the
“list-like” nature of chunked arrays. There are two separate arrays,
wrapped in a container object that is secretly a list of arrays, but
allows that list to behave just like a regular one-dimensional data
structure. Schematically it looks like this:</p>
<p><img src="chunked_array_layout.png" width="100%" alt="Schematic of the chunked array: a container wrapping a list of arrays, each with its own validity bitmap, offset buffer, and data buffer."></p>
<p>As this figure illustrates, there really are two arrays here, each
with its own validity bitmap, offset buffer, and data buffer.</p>
</div>
<div class="section level2">
<h2 id="record-batches">Record batches<a class="anchor" aria-label="anchor" href="#record-batches"></a>
</h2>
<p>A record batch is a table-like data structure composed of a sequence
of arrays. The arrays can be of different types but they must all be the
same length. Each array is referred to as one of the “fields” or
“columns” of the record batch. Each field must have a (UTF8-encoded)
name, and these names form part of the metadata for the record batch.
When stored in memory, the record batch does not include physical
storage for the values stored in each field: instead it contains
pointers to the relevant array objects. It does, however, contain its
own validity bitmap.</p>
<p>Here is a record batch containing 5 rows and 3 columns:</p>
<div class="sourceCode" id="cb11"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">rb</span> <span class="op">&lt;-</span> <span class="fu"><a href="../../reference/record_batch.html">record_batch</a></span><span class="op">(</span></span>
<span> strs <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"hello"</span>, <span class="st">"amazing"</span>, <span class="st">"and"</span>, <span class="st">"cruel"</span>, <span class="st">"world"</span><span class="op">)</span>,</span>
<span> ints <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1L</span>, <span class="cn">NA</span>, <span class="fl">2L</span>, <span class="fl">4L</span>, <span class="fl">8L</span><span class="op">)</span>,</span>
<span> dbls <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1.1</span>, <span class="fl">3.2</span>, <span class="fl">0.2</span>, <span class="cn">NA</span>, <span class="fl">11</span><span class="op">)</span></span>
<span><span class="op">)</span></span>
<span><span class="va">rb</span></span></code></pre></div>
<pre><code><span><span class="co">## RecordBatch</span></span>
<span><span class="co">## 5 rows x 3 columns</span></span>
<span><span class="co">## $strs &lt;string&gt;</span></span>
<span><span class="co">## $ints &lt;int32&gt;</span></span>
<span><span class="co">## $dbls &lt;double&gt;</span></span></code></pre>
<p>At an abstract level the <code>rb</code> object behaves like a two
dimensional structure with rows and columns, but in terms of how it is
represented in memory it is fundamentally a list of arrays as shown
below:</p>
<p><img src="record_batch_layout.png" width="100%" alt="Schematic of the record batch, represented in memory as a list of arrays, one per column."></p>
</div>
<div class="section level2">
<h2 id="tables">Tables<a class="anchor" aria-label="anchor" href="#tables"></a>
</h2>
<p>To deal with situations where a rectangular data set can grow over
time (as more data are added), we need a tabular data structure that is
similar to a record batch with one exception: instead of storing each
column as an array, we now want to store it as a chunked array. This is
what the <code>Table</code> class in <strong>arrow</strong> does.</p>
<p>To illustrate, suppose we have a second set of data that arrives as a
record batch:</p>
<div class="sourceCode" id="cb13"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">new_rb</span> <span class="op">&lt;-</span> <span class="fu"><a href="../../reference/record_batch.html">record_batch</a></span><span class="op">(</span></span>
<span> strs <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"I"</span>, <span class="st">"love"</span>, <span class="st">"you"</span><span class="op">)</span>,</span>
<span> ints <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">5L</span>, <span class="fl">0L</span>, <span class="fl">0L</span><span class="op">)</span>,</span>
<span> dbls <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">7.1</span>, <span class="op">-</span><span class="fl">0.1</span>, <span class="fl">2</span><span class="op">)</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="../../reference/concat_tables.html">concat_tables</a></span><span class="op">(</span><span class="fu"><a href="../../reference/table.html">arrow_table</a></span><span class="op">(</span><span class="va">rb</span><span class="op">)</span>, <span class="fu"><a href="../../reference/table.html">arrow_table</a></span><span class="op">(</span><span class="va">new_rb</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="va">df</span></span></code></pre></div>
<pre><code><span><span class="co">## Table</span></span>
<span><span class="co">## 8 rows x 3 columns</span></span>
<span><span class="co">## $strs &lt;string&gt;</span></span>
<span><span class="co">## $ints &lt;int32&gt;</span></span>
<span><span class="co">## $dbls &lt;double&gt;</span></span></code></pre>
<p>Here is the underlying structure of this Table:</p>
<p><img src="table_layout.png" width="100%" alt="Schematic of the Table, in which each column is stored as a chunked array."></p>
</div>
</main><aside class="col-md-3"><nav id="toc" aria-label="Table of contents"><h2>On this page</h2>
</nav></aside>
</div>
<footer><div class="pkgdown-footer-left">
<p><a href="https://arrow.apache.org/docs/r/versions.html">Older versions of these docs</a></p>
</div>
<div class="pkgdown-footer-right">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.1.3.</p>
</div>
</footer>
</div>
</body>
</html>