<!-- blob: ab5d14b4caeaeec1901d6504cc5960104e265d6c [file] [log] [blame] -->
<!-- Generated by pkgdown: do not edit by hand -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Create a Source for a Dataset — open_source • Arrow R Package</title>
<!-- jquery -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script>
<!-- Bootstrap -->
<link href="https://cdnjs.cloudflare.com/ajax/libs/bootswatch/3.3.7/cosmo/bootstrap.min.css" rel="stylesheet" crossorigin="anonymous" />
<script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha256-U5ZEeKfGNOja007MMD3YBI0A3OSZOQbeG6z2f2Y0hu8=" crossorigin="anonymous"></script>
<!-- Font Awesome icons -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.7.1/css/all.min.css" integrity="sha256-nAmazAk6vS34Xqo0BSrTb+abbtFlgsFK7NKSi6o7Y78=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.7.1/css/v4-shims.min.css" integrity="sha256-6qHlizsOWFskGlwVOKuns+D1nB6ssZrHQrNj1wGplHc=" crossorigin="anonymous" />
<!-- clipboard.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.4/clipboard.min.js" integrity="sha256-FiZwavyI2V6+EXO1U+xzLG3IKldpiTFf3153ea9zikQ=" crossorigin="anonymous"></script>
<!-- headroom.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.9.4/headroom.min.js" integrity="sha256-DJFC1kqIhelURkuza0AvYal5RxMtpzLjFhsnVIeuk+U=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.9.4/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script>
<!-- pkgdown -->
<link href="../pkgdown.css" rel="stylesheet">
<script src="../pkgdown.js"></script>
<!-- Open Graph and Twitter card metadata for this reference page -->
<meta property="og:title" content="Create a Source for a Dataset — open_source">
<meta property="og:description" content="A Dataset can have one or more Sources. A Source contains one or more
Fragments, such as files, of a common storage location, format, and
partitioning. This function helps you construct a Source that you can
pass to open_dataset().">
<meta name="twitter:card" content="summary">
<!-- mathjax -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script>
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
<!-- Matomo -->
<script>
  // Reuse an existing command queue if the page already created one; otherwise start a new one.
  var _paq = window._paq = window._paq || [];
  /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
  _paq.push(["setDoNotTrack", true]);
  _paq.push(["disableCookies"]);
  _paq.push(["trackPageView"]);
  _paq.push(["enableLinkTracking"]);
  (function () {
    var baseUrl = "https://analytics.apache.org/";
    _paq.push(["setTrackerUrl", baseUrl + "matomo.php"]);
    _paq.push(["setSiteId", "20"]);
    // Create the tracker script element and insert it ahead of the first script element in the document.
    var trackerScript = document.createElement("script");
    var firstScript = document.getElementsByTagName("script")[0];
    trackerScript.async = true;
    trackerScript.src = baseUrl + "matomo.js";
    firstScript.parentNode.insertBefore(trackerScript, firstScript);
  })();
</script>
<!-- End Matomo Code -->
</head>
<body>
<div class="container template-reference-topic">
<header>
  <!-- Fixed top navigation bar (Bootstrap 3 navbar markup). The landmark is labeled so it can be
       told apart from other navigation regions on the page. -->
  <div class="navbar navbar-default navbar-fixed-top" role="navigation" aria-label="Main">
    <div class="container">
      <div class="navbar-header">
        <!-- Collapse toggle shown on narrow viewports; targets the #navbar container below -->
        <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
          <span class="sr-only">Toggle navigation</span>
          <span class="icon-bar"></span>
          <span class="icon-bar"></span>
          <span class="icon-bar"></span>
        </button>
        <span class="navbar-brand">
          <a class="navbar-link" href="../index.html">Arrow R Package</a>
          <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Released version">0.16.0</span>
        </span>
      </div>
      <div id="navbar" class="navbar-collapse collapse">
        <ul class="nav navbar-nav">
          <li>
            <!-- The visible text is decorative glyphs only, so give the link a readable name -->
            <a href="https://arrow.apache.org/" aria-label="Apache Arrow home page">❯❯❯</a>
          </li>
          <li>
            <a href="../articles/arrow.html">Get started</a>
          </li>
          <li>
            <a href="../reference/index.html">Reference</a>
          </li>
          <li class="dropdown">
            <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
              Articles
              <span class="caret"></span>
            </a>
            <ul class="dropdown-menu" role="menu">
              <li>
                <a href="../articles/dataset.html">Working with Arrow Datasets and dplyr</a>
              </li>
              <li>
                <a href="../articles/install.html">Installing the Arrow Package on Linux</a>
              </li>
            </ul>
          </li>
          <li>
            <a href="../news/index.html">Changelog</a>
          </li>
          <li class="dropdown">
            <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
              Project docs
              <span class="caret"></span>
            </a>
            <ul class="dropdown-menu" role="menu">
              <li>
                <a href="https://arrow.apache.org/docs/format/README.html">Specification</a>
              </li>
              <li>
                <a href="https://arrow.apache.org/docs/c_glib">C GLib</a>
              </li>
              <li>
                <a href="https://arrow.apache.org/docs/cpp">C++</a>
              </li>
              <li>
                <a href="https://arrow.apache.org/docs/java">Java</a>
              </li>
              <li>
                <a href="https://arrow.apache.org/docs/js">JavaScript</a>
              </li>
              <li>
                <a href="https://arrow.apache.org/docs/python">Python</a>
              </li>
              <li>
                <a href="../index.html">R</a>
              </li>
            </ul>
          </li>
        </ul>
        <ul class="nav navbar-nav navbar-right">
          <li>
            <!-- Icon-only link: the accessible name comes from aria-label, the icon itself is hidden from assistive technology -->
            <a href="https://github.com/apache/arrow" aria-label="Apache Arrow on GitHub">
              <span class="fab fa fa-github fa-lg" aria-hidden="true"></span>
            </a>
          </li>
        </ul>
      </div><!--/.nav-collapse -->
    </div><!--/.container -->
  </div><!--/.navbar -->
</header>
<div class="row">
<div class="col-md-9 contents">
<!-- Page title, link to the R source file, and the hidden Rd topic name -->
<div class="page-header">
  <h1>Create a Source for a Dataset</h1>
  <small class="dont-index">Source: <a href="https://github.com/apache/arrow/blob/master/R/dataset.R"><code>R/dataset.R</code></a></small>
  <div class="hidden name"><code>open_source.Rd</code></div>
</div>
<!-- Topic description -->
<div class="ref-description">
  <p>A <a href="Dataset.html">Dataset</a> can have one or more <a href="Source.html">Source</a>s. A <code>Source</code> contains one or more
  <code>Fragments</code>, such as files, of a common storage location, format, and
  partitioning. This function helps you construct a <code>Source</code> that you can
  pass to <code><a href="open_dataset.html">open_dataset()</a></code>.</p>
</div>
<!-- Usage signature. Whitespace inside <pre> is rendered as-is, so these lines are intentionally not indented. -->
<pre class="usage"><span class="fu">open_source</span>(
<span class="no">path</span>,
<span class="kw">filesystem</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"auto"</span>, <span class="st">"local"</span>),
<span class="kw">format</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"parquet"</span>, <span class="st">"arrow"</span>, <span class="st">"ipc"</span>),
<span class="kw">partitioning</span> <span class="kw">=</span> <span class="kw">NULL</span>,
<span class="kw">allow_non_existent</span> <span class="kw">=</span> <span class="fl">FALSE</span>,
<span class="kw">recursive</span> <span class="kw">=</span> <span class="fl">TRUE</span>,
<span class="no">...</span>
)</pre>
<h2 class="hasAnchor" id="arguments"><a class="anchor" href="#arguments"></a>Arguments</h2>
<!-- One row per argument of open_source(): the argument name is the row header (scope="row"),
     the description is the data cell. -->
<table class="ref-arguments">
  <colgroup><col class="name"><col class="desc"></colgroup>
  <tr>
    <th scope="row">path</th>
    <td><p>A string file path containing data files</p></td>
  </tr>
  <tr>
    <th scope="row">filesystem</th>
    <td><p>A string identifier for the filesystem corresponding to
    <code>path</code>. Currently only "local" is supported.</p></td>
  </tr>
  <tr>
    <th scope="row">format</th>
    <td><p>A string identifier of the format of the files in <code>path</code>.
    Currently supported options are "parquet", "arrow", and "ipc" (an alias for
    the Arrow file format)</p></td>
  </tr>
  <tr>
    <th scope="row">partitioning</th>
    <td><p>One of</p><ul>
      <!-- File paths are resolved against this function's "path" argument; there is no "sources" argument here -->
      <li><p>A <code>Schema</code>, in which case the file paths relative to <code>path</code> will be
      parsed, and path segments will be matched with the schema fields. For
      example, <code><a href="Schema.html">schema(year = int16(), month = int8())</a></code> would create partitions
      for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.</p></li>
      <li><p>A character vector that defines the field names corresponding to those
      path segments (that is, you're providing the names that would correspond
      to a <code>Schema</code> but the types will be autodetected)</p></li>
      <li><p>A <code>HivePartitioning</code> or <code>HivePartitioningFactory</code>, as returned
      by <code><a href="hive_partition.html">hive_partition()</a></code> which parses explicit or autodetected fields from
      Hive-style path segments</p></li>
      <li><p><code>NULL</code> for no partitioning</p></li>
    </ul></td>
  </tr>
  <tr>
    <th scope="row">allow_non_existent</th>
    <td><p>logical: is <code>path</code> allowed to not exist? Default
    <code>FALSE</code>. See <a href="FileSelector.html">FileSelector</a>.</p></td>
  </tr>
  <tr>
    <th scope="row">recursive</th>
    <td><p>logical: should files be discovered in subdirectories of
    <code>path</code>? Default <code>TRUE</code>.</p></td>
  </tr>
  <tr>
    <th scope="row">...</th>
    <td><p>Additional arguments passed to the <a href="FileSystem.html">FileSystem</a> <code>$create()</code> method</p></td>
  </tr>
</table>
<!-- Return value -->
<h2 class="hasAnchor" id="value"><a class="anchor" href="#value"></a>Value</h2>
<p>A <code>SourceFactory</code> object. Pass this to <code><a href="open_dataset.html">open_dataset()</a></code>,
  in a list potentially with other <code>SourceFactory</code> objects, to create
  a <code>Dataset</code>.</p>
<!-- When to use open_source() rather than open_dataset() -->
<h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2>
<p>If you only have a single <code>Source</code>, such as a directory containing Parquet
  files, you can call <code><a href="open_dataset.html">open_dataset()</a></code> directly. Use <code>open_source()</code> when you
  want to combine different directories, file systems, or file formats.</p>
</div>
<!-- In-page table of contents. Marked up as a labeled navigation landmark; the id and classes are
     unchanged so existing id/class-based styling and scripting still apply. -->
<nav class="col-md-3 hidden-xs hidden-sm" id="sidebar" aria-label="Contents">
  <h2>Contents</h2>
  <ul class="nav nav-pills nav-stacked">
    <li><a href="#arguments">Arguments</a></li>
    <li><a href="#value">Value</a></li>
    <li><a href="#details">Details</a></li>
  </ul>
</nav>
</div>
<!-- Page footer: author credits and site generator notice -->
<footer>
  <div class="copyright">
    <p>Developed by Romain François, Jeroen Ooms, Neal Richardson, Apache Arrow.</p>
  </div>
  <div class="pkgdown">
    <p>Site built with <a href="https://pkgdown.r-lib.org/">pkgdown</a> 1.4.1.</p>
  </div>
</footer>
</div>
<script src="/docs/_static/versionwarning.js"></script> </body>
</html>