<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name="description" content='Apache Arrow defines two formats for serializing data for interprocess communication (IPC):
a "stream" format and a "file" format, known as Feather.
RecordBatchStreamReader and RecordBatchFileReader are
interfaces for accessing record batches from input sources in those formats,
respectively.
For guidance on how to use these classes, see the examples section.'><title>RecordBatchReader classes — RecordBatchReader • Arrow R Package</title><!-- favicons --><link rel="icon" type="image/png" sizes="16x16" href="../favicon-16x16.png"><link rel="icon" type="image/png" sizes="32x32" href="../favicon-32x32.png"><link rel="apple-touch-icon" type="image/png" sizes="180x180" href="../apple-touch-icon.png"><link rel="apple-touch-icon" type="image/png" sizes="120x120" href="../apple-touch-icon-120x120.png"><link rel="apple-touch-icon" type="image/png" sizes="76x76" href="../apple-touch-icon-76x76.png"><link rel="apple-touch-icon" type="image/png" sizes="60x60" href="../apple-touch-icon-60x60.png"><script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="../deps/bootstrap-5.2.2/bootstrap.min.css" rel="stylesheet"><script src="../deps/bootstrap-5.2.2/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- bootstrap-toc --><script src="https://cdn.jsdelivr.net/gh/afeld/bootstrap-toc@v1.0.1/dist/bootstrap-toc.min.js" integrity="sha256-4veVQbu7//Lk5TSmc7YV48MxtMy98e26cf5MrgZYnwo=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" 
crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><script src="../extra.js"></script><meta property="og:title" content="RecordBatchReader classes — RecordBatchReader"><meta property="og:description" content='Apache Arrow defines two formats for serializing data for interprocess communication (IPC):
a "stream" format and a "file" format, known as Feather.
RecordBatchStreamReader and RecordBatchFileReader are
interfaces for accessing record batches from input sources in those formats,
respectively.
For guidance on how to use these classes, see the examples section.'><meta property="og:image" content="https://arrow.apache.org/img/arrow-logo_horizontal_black-txt_white-bg.png"><meta property="og:image:alt" content="Apache Arrow logo, displaying the triple chevron image adjacent to the text"><meta name="twitter:card" content="summary_large_image"><meta name="twitter:creator" content="@apachearrow"><meta name="twitter:site" content="@apachearrow"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]--><!-- Matomo --><script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script><!-- End Matomo Code --></head><body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-black"><div class="container">
<a class="navbar-brand me-2" href="../index.html">Arrow R Package</a>
<span class="version">
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">11.0.0</small>
</span>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto"><li class="nav-item">
<a class="nav-link" href="../articles/arrow.html">Get started</a>
</li>
<li class="active nav-item">
<a class="nav-link" href="../reference/index.html">Reference</a>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-articles">Articles</a>
<div class="dropdown-menu" aria-labelledby="dropdown-articles">
<h6 class="dropdown-header" data-toc-skip>Using the package</h6>
<a class="dropdown-item" href="../articles/read_write.html">Reading and writing data files</a>
<a class="dropdown-item" href="../articles/data_wrangling.html">Data analysis with dplyr syntax</a>
<a class="dropdown-item" href="../articles/dataset.html">Working with multi-file data sets</a>
<a class="dropdown-item" href="../articles/python.html">Integrating Arrow, Python, and R</a>
<a class="dropdown-item" href="../articles/fs.html">Using cloud storage (S3, GCS)</a>
<a class="dropdown-item" href="../articles/flight.html">Connecting to a Flight server</a>
<div class="dropdown-divider"></div>
<h6 class="dropdown-header" data-toc-skip>Arrow concepts</h6>
<a class="dropdown-item" href="../articles/data_objects.html">Data objects</a>
<a class="dropdown-item" href="../articles/data_types.html">Data types</a>
<a class="dropdown-item" href="../articles/metadata.html">Metadata</a>
<div class="dropdown-divider"></div>
<h6 class="dropdown-header" data-toc-skip>Installation</h6>
<a class="dropdown-item" href="../articles/install.html">Installing on Linux</a>
<a class="dropdown-item" href="../articles/install_nightly.html">Installing development versions</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="../articles/index.html">More articles...</a>
</div>
</li>
<li class="nav-item">
<a class="nav-link" href="../news/index.html">Changelog</a>
</li>
</ul><form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search site" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
<ul class="navbar-nav"></ul></div>
</div>
</nav><div class="container template-reference-topic">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>RecordBatchReader classes</h1>
<small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/master/r/R/record-batch-reader.R" class="external-link"><code>R/record-batch-reader.R</code></a></small>
<div class="d-none name"><code>RecordBatchReader.Rd</code></div>
</div>
<div class="ref-description section level2">
<p>Apache Arrow defines two formats for <a href="https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc" class="external-link">serializing data for interprocess communication (IPC)</a>:
a "stream" format and a "file" format, known as Feather.
<code>RecordBatchStreamReader</code> and <code>RecordBatchFileReader</code> are
interfaces for accessing record batches from input sources in those formats,
respectively.</p>
<p>For guidance on how to use these classes, see the examples section.</p>
</div>
<div class="section level2">
<h2 id="factory">Factory<a class="anchor" aria-label="anchor" href="#factory"></a></h2>
<p>The <code>RecordBatchFileReader$create()</code> and <code>RecordBatchStreamReader$create()</code>
factory methods instantiate the object and
take a single argument, named according to the class:</p><ul><li><p><code>file</code> A character file name, raw vector, or Arrow file connection object
(e.g. <a href="InputStream.html">RandomAccessFile</a>).</p></li>
<li><p><code>stream</code> A raw vector, <a href="buffer.html">Buffer</a>, or <a href="InputStream.html">InputStream</a>.</p></li>
</ul></div>
<div class="section level2">
<h2 id="methods">Methods<a class="anchor" aria-label="anchor" href="#methods"></a></h2>
<ul><li><p><code>$read_next_batch()</code>: Returns a <code>RecordBatch</code>, iterating through the
Reader. If there are no further batches in the Reader, it returns <code>NULL</code>.</p></li>
<li><p><code>$schema</code>: Returns a <a href="Schema.html">Schema</a> (active binding)</p></li>
<li><p><code>$batches()</code>: Returns a list of <code>RecordBatch</code>es</p></li>
<li><p><code>$read_table()</code>: Collects the reader's <code>RecordBatch</code>es into a <a href="Table.html">Table</a></p></li>
<li><p><code>$get_batch(i)</code>: For <code>RecordBatchFileReader</code>, return a particular batch
by an integer index.</p></li>
<li><p><code>$num_record_batches</code>: For <code>RecordBatchFileReader</code>, see how many batches
are in the file (active binding).</p></li>
</ul></div>
<div class="section level2">
<h2 id="see-also">See also<a class="anchor" aria-label="anchor" href="#see-also"></a></h2>
<div class="dont-index"><p><code><a href="read_ipc_stream.html">read_ipc_stream()</a></code> and <code><a href="read_feather.html">read_feather()</a></code> provide a much simpler interface
for reading data from these formats and are sufficient for many use cases.</p></div>
</div>
<div class="section level2">
<h2 id="ref-examples">Examples<a class="anchor" aria-label="anchor" href="#ref-examples"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="va">tf</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/on.exit.html" class="external-link">on.exit</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/unlink.html" class="external-link">unlink</a></span><span class="op">(</span><span class="va">tf</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="va">batch</span> <span class="op">&lt;-</span> <span class="fu"><a href="RecordBatch.html">record_batch</a></span><span class="op">(</span><span class="va">chickwts</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># This opens a connection to the file in Arrow</span></span></span>
<span class="r-in"><span><span class="va">file_obj</span> <span class="op">&lt;-</span> <span class="va">FileOutputStream</span><span class="op">$</span><span class="fu">create</span><span class="op">(</span><span class="va">tf</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># Pass that to a RecordBatchWriter to write data conforming to a schema</span></span></span>
<span class="r-in"><span><span class="va">writer</span> <span class="op">&lt;-</span> <span class="va">RecordBatchFileWriter</span><span class="op">$</span><span class="fu">create</span><span class="op">(</span><span class="va">file_obj</span>, <span class="va">batch</span><span class="op">$</span><span class="va">schema</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">writer</span><span class="op">$</span><span class="fu">write</span><span class="op">(</span><span class="va">batch</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># You may write additional batches to the stream, provided that they have</span></span></span>
<span class="r-in"><span><span class="co"># the same schema.</span></span></span>
<span class="r-in"><span><span class="co"># Call "close" on the writer to indicate end-of-file/stream</span></span></span>
<span class="r-in"><span><span class="va">writer</span><span class="op">$</span><span class="fu">close</span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># Then, close the connection--closing the IPC message does not close the file</span></span></span>
<span class="r-in"><span><span class="va">file_obj</span><span class="op">$</span><span class="fu">close</span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># Now, we have a file we can read from. Same pattern: open file connection,</span></span></span>
<span class="r-in"><span><span class="co"># then pass it to a RecordBatchReader</span></span></span>
<span class="r-in"><span><span class="va">read_file_obj</span> <span class="op">&lt;-</span> <span class="va">ReadableFile</span><span class="op">$</span><span class="fu">create</span><span class="op">(</span><span class="va">tf</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">reader</span> <span class="op">&lt;-</span> <span class="va">RecordBatchFileReader</span><span class="op">$</span><span class="fu">create</span><span class="op">(</span><span class="va">read_file_obj</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># RecordBatchFileReader knows how many batches it has (StreamReader does not)</span></span></span>
<span class="r-in"><span><span class="va">reader</span><span class="op">$</span><span class="va">num_record_batches</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> [1] 1</span>
<span class="r-in"><span><span class="co"># We could consume the Reader by calling $read_next_batch() until all are</span></span></span>
<span class="r-in"><span><span class="co"># consumed, or we can call $read_table() to pull them all into a Table</span></span></span>
<span class="r-in"><span><span class="va">tab</span> <span class="op">&lt;-</span> <span class="va">reader</span><span class="op">$</span><span class="fu">read_table</span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># Call as.data.frame to turn that Table into an R data.frame</span></span></span>
<span class="r-in"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/as.data.frame.html" class="external-link">as.data.frame</a></span><span class="op">(</span><span class="va">tab</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># This should be the same data we sent</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/all.equal.html" class="external-link">all.equal</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">chickwts</span>, check.attributes <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> [1] TRUE</span>
<span class="r-in"><span><span class="co"># Unlike the Writers, we don't have to close RecordBatchReaders,</span></span></span>
<span class="r-in"><span><span class="co"># but we do still need to close the file connection</span></span></span>
<span class="r-in"><span><span class="va">read_file_obj</span><span class="op">$</span><span class="fu">close</span><span class="op">(</span><span class="op">)</span></span></span>
</code></pre></div>
</div>
</main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2>
</nav></aside></div>
<footer><div class="pkgdown-footer-left">
<p></p><p><a href="https://arrow.apache.org/docs/r/versions.html">Older versions of these docs</a></p>
</div>
<div class="pkgdown-footer-right">
<p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.7.</p>
</div>
</footer></div>
</body></html>