<!-- blob: e398ee2a34d8369de1e313056f664c51267d8878 [file] [log] [blame] -->
<!-- Generated by pkgdown: do not edit by hand -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Multi-file datasets — Dataset • Arrow R Package</title>
<!-- jquery -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script>
<!-- Bootstrap -->
<link href="https://cdnjs.cloudflare.com/ajax/libs/bootswatch/3.4.0/cosmo/bootstrap.min.css" rel="stylesheet" crossorigin="anonymous" />
<script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script>
<!-- bootstrap-toc -->
<link rel="stylesheet" href="../bootstrap-toc.css">
<script src="../bootstrap-toc.js"></script>
<!-- Font Awesome icons -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous" />
<!-- clipboard.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script>
<!-- headroom.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script>
<!-- pkgdown -->
<link href="../pkgdown.css" rel="stylesheet">
<script src="../pkgdown.js"></script>
<script src="../extra.js"></script>
<meta property="og:title" content="Multi-file datasets — Dataset" />
<meta property="og:description" content="Arrow Datasets allow you to query against data that has been split across
multiple files. This sharding of data may indicate partitioning, which
can accelerate queries that only touch some partitions (files).
A Dataset contains one or more Fragments, such as files, of potentially
differing type and partitioning.
For Dataset$create(), see open_dataset(), which is an alias for it.
DatasetFactory is used to provide finer control over the creation of Datasets." />
<!-- mathjax -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script>
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
<!-- Matomo -->
<script>
// Matomo command queue. Reuse the array already present on window when one
// exists; otherwise start with an empty one. Each entry is an array whose
// first element is a tracker method name, followed by its arguments.
var _paq = window._paq = window._paq || [];
(function (queue, doc) {
  // Base URL shared by the tracking endpoint and the tracker script.
  var trackerBase = "https://analytics.apache.org/";

  /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
  // Commands are listed in exactly the order they must be queued.
  var commands = [
    ["setDoNotTrack", true],
    ["disableCookies"],
    ['trackPageView'],
    ['enableLinkTracking'],
    ['setTrackerUrl', trackerBase + 'matomo.php'],
    ['setSiteId', '20']
  ];
  for (var i = 0; i < commands.length; i += 1) {
    queue.push(commands[i]);
  }

  // Load the tracker script asynchronously by inserting a new <script>
  // element immediately before the first script element in the document.
  var loader = doc.createElement('script');
  var firstScript = doc.getElementsByTagName('script')[0];
  loader.async = true;
  loader.src = trackerBase + 'matomo.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
})(_paq, document);
</script>
<!-- End Matomo Code -->
</head>
<body data-spy="scroll" data-target="#toc">
<div class="container template-reference-topic">
<header>
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">Arrow R Package</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Released version">4.0.1</span>
</span>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li>
<a href="https://arrow.apache.org/" aria-label="Apache Arrow home">❯❯❯</a>
</li>
<li>
<a href="../articles/arrow.html">Get started</a>
</li>
<li>
<a href="../reference/index.html">Reference</a>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
Articles
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu">
<li>
<a href="../articles/install.html">Installing the Arrow Package on Linux</a>
</li>
<li>
<a href="../articles/dataset.html">Working with Arrow Datasets and dplyr</a>
</li>
<li>
<a href="../articles/fs.html">Working with Cloud Storage (S3)</a>
</li>
<li>
<a href="../articles/python.html">Apache Arrow in Python and R with reticulate</a>
</li>
<li>
<a href="../articles/flight.html">Connecting to Flight RPC Servers</a>
</li>
<li>
<a href="../articles/developing.html">Arrow R Developer Guide</a>
</li>
</ul>
</li>
<li>
<a href="../news/index.html">Changelog</a>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
Project docs
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu">
<li>
<a href="https://arrow.apache.org/docs/format/README.html">Specification</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/c_glib">C GLib</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/cpp">C++</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/java">Java</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/js">JavaScript</a>
</li>
<li>
<a href="https://arrow.apache.org/docs/python">Python</a>
</li>
<li>
<a href="../index.html">R</a>
</li>
</ul>
</li>
</ul>
<ul class="nav navbar-nav navbar-right">
</ul>
</div><!--/.nav-collapse -->
</div><!--/.container -->
</div><!--/.navbar -->
</header>
<div class="row">
<div class="col-md-9 contents">
<div class="page-header">
<h1>Multi-file datasets</h1>
<small class="dont-index">Source: <a href='https://github.com/apache/arrow/blob/master/r/R/dataset.R'><code>R/dataset.R</code></a>, <a href='https://github.com/apache/arrow/blob/master/r/R/dataset-factory.R'><code>R/dataset-factory.R</code></a></small>
<div class="hidden name"><code>Dataset.Rd</code></div>
</div>
<div class="ref-description">
<p>Arrow Datasets allow you to query against data that has been split across
multiple files. This sharding of data may indicate partitioning, which
can accelerate queries that only touch some partitions (files).</p>
<p>A <code>Dataset</code> contains one or more <code>Fragments</code>, such as files, of potentially
differing type and partitioning.</p>
<p>For <code>Dataset$create()</code>, see <code><a href='open_dataset.html'>open_dataset()</a></code>, which is an alias for it.</p>
<p><code>DatasetFactory</code> is used to provide finer control over the creation of <code>Dataset</code>s.</p>
</div>
<h2 class="hasAnchor" id="factory"><a class="anchor" href="#factory"></a>Factory</h2>
<p><code>DatasetFactory</code> is used to create a <code>Dataset</code>, inspect the <a href='Schema.html'>Schema</a> of the
fragments contained in it, and declare a partitioning.
<code>FileSystemDatasetFactory</code> is a subclass of <code>DatasetFactory</code> for
discovering files in the local file system, the only currently supported
file system.</p>
<p>For the <code>DatasetFactory$create()</code> factory method, see <code><a href='dataset_factory.html'>dataset_factory()</a></code>, an
alias for it. A <code>DatasetFactory</code> has:</p><ul>
<li><p><code>$Inspect(unify_schemas)</code>: If <code>unify_schemas</code> is <code>TRUE</code>, all fragments
will be scanned and a unified <a href='Schema.html'>Schema</a> will be created from them; if <code>FALSE</code>
(default), only the first fragment will be inspected for its schema. Use this
fast path when you know and trust that all fragments have an identical schema.</p></li>
<li><p><code>$Finish(schema, unify_schemas)</code>: Returns a <code>Dataset</code>. If <code>schema</code> is provided,
it will be used for the <code>Dataset</code>; if omitted, a <code>Schema</code> will be created from
inspecting the fragments (files) in the dataset, following <code>unify_schemas</code>
as described above.</p></li>
</ul>
<p><code>FileSystemDatasetFactory$create()</code> is a lower-level factory method and
takes the following arguments:</p><ul>
<li><p><code>filesystem</code>: A <a href='FileSystem.html'>FileSystem</a></p></li>
<li><p><code>selector</code>: Either a <a href='FileSelector.html'>FileSelector</a> or <code>NULL</code></p></li>
<li><p><code>paths</code>: Either a character vector of file paths or <code>NULL</code></p></li>
<li><p><code>format</code>: A <a href='FileFormat.html'>FileFormat</a></p></li>
<li><p><code>partitioning</code>: Either <code>Partitioning</code>, <code>PartitioningFactory</code>, or <code>NULL</code></p></li>
</ul>
<h2 class="hasAnchor" id="methods"><a class="anchor" href="#methods"></a>Methods</h2>
<p>A <code>Dataset</code> has the following methods:</p><ul>
<li><p><code>$NewScan()</code>: Returns a <a href='Scanner.html'>ScannerBuilder</a> for building a query</p></li>
<li><p><code>$schema</code>: Active binding that returns the <a href='Schema.html'>Schema</a> of the Dataset; you
may also replace the dataset's schema by using <code>ds$schema &lt;- new_schema</code>.
This method currently supports only adding, removing, or reordering
fields in the schema: you cannot alter or cast the field types.</p></li>
</ul>
<p><code>FileSystemDataset</code> has the following methods:</p><ul>
<li><p><code>$files</code>: Active binding, returns the files of the <code>FileSystemDataset</code></p></li>
<li><p><code>$format</code>: Active binding, returns the <a href='FileFormat.html'>FileFormat</a> of the <code>FileSystemDataset</code></p></li>
</ul>
<p><code>UnionDataset</code> has the following methods:</p><ul>
<li><p><code>$children</code>: Active binding, returns all child <code>Dataset</code>s.</p></li>
</ul>
<h2 class="hasAnchor" id="see-also"><a class="anchor" href="#see-also"></a>See also</h2>
<div class='dont-index'><p><code><a href='open_dataset.html'>open_dataset()</a></code> for a simple interface to creating a <code>Dataset</code></p></div>
</div>
<div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
<nav id="toc" data-toggle="toc" class="sticky-top">
<h2 data-toc-skip>Contents</h2>
</nav>
</div>
</div>
<footer>
<div class="copyright">
<p>Developed by Neal Richardson, Ian Cook, Jonathan Keane, Romain François, Jeroen Ooms, Apache Arrow.</p>
</div>
<div class="pkgdown">
<p>Site built with <a href="https://pkgdown.r-lib.org/">pkgdown</a> 1.6.1.</p>
</div>
</footer>
</div>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script> </body>
</html>