<!-- blob: 247865ae056db3858af69aae1a808aa1167b6420 [file] [log] [blame] -->
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Filesystem Interface &mdash; Apache Arrow v3.0.0</title>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../_static/theme_overrides.css" type="text/css" />
<!--[if lt IE 9]>
<script src="../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/language_data.js"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="canonical" href="https://arrow.apache.org/docs/python/filesystems.html" />
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Filesystem Interface (legacy)" href="filesystems_deprecated.html" />
<link rel="prev" title="Streaming, Serialization, and IPC" href="ipc.html" />
<!-- Matomo -->
<script>
// Matomo analytics bootstrap.
// Reuse window._paq if it already exists; otherwise start a new, empty command queue.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
// "setDoNotTrack" and "disableCookies" are queued ahead of "trackPageView",
// in line with the ordering note above.
_paq.push(["setDoNotTrack", true]);
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
// The IIFE keeps the helper variables (u, d, g, s) out of the global scope.
(function() {
// Base URL shared by the tracker endpoint (matomo.php) and the tracker script (matomo.js).
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
// Create a new <script> element and look up the first <script> already in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
// Mark the new script async, point it at matomo.js, and insert it before that first script.
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home"> Apache Arrow
</a>
<div class="version">
3.0.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Specifications and Protocols</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../format/Versioning.html">Format Versioning and Stability</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Columnar.html">Arrow Columnar Format</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Integration.html">Integration Testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/CDataInterface.html">The Arrow C data interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/CStreamInterface.html">The Arrow C stream interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Other.html">Other Data Structures</a></li>
</ul>
<p class="caption"><span class="caption-text">Libraries</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../status.html">Implementation Status</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cpp/index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li>
<li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li>
<li class="toctree-l1"><a class="reference internal" href="../java/index.html">Java</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/julia/Arrow/README.md">Julia</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Python</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="install.html">Installing PyArrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="memory.html">Memory and IO Interfaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="data.html">Data Types and In-Memory Data Model</a></li>
<li class="toctree-l2"><a class="reference internal" href="compute.html">Compute Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="ipc.html">Streaming, Serialization, and IPC</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Filesystem Interface</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#usage">Usage</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#reading-and-writing-files">Reading and writing files</a></li>
<li class="toctree-l4"><a class="reference internal" href="#listing-files">Listing files</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#s3">S3</a></li>
<li class="toctree-l3"><a class="reference internal" href="#hadoop-file-system-hdfs">Hadoop File System (HDFS)</a></li>
<li class="toctree-l3"><a class="reference internal" href="#using-fsspec-compatible-filesystems">Using fsspec-compatible filesystems</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="filesystems_deprecated.html">Filesystem Interface (legacy)</a></li>
<li class="toctree-l2"><a class="reference internal" href="plasma.html">The Plasma In-Memory Object Store</a></li>
<li class="toctree-l2"><a class="reference internal" href="numpy.html">NumPy Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="pandas.html">Pandas Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="timestamps.html">Timestamps</a></li>
<li class="toctree-l2"><a class="reference internal" href="csv.html">Reading CSV files</a></li>
<li class="toctree-l2"><a class="reference internal" href="feather.html">Feather File Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="json.html">Reading JSON files</a></li>
<li class="toctree-l2"><a class="reference internal" href="parquet.html">Reading and Writing the Apache Parquet Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="dataset.html">Tabular Datasets</a></li>
<li class="toctree-l2"><a class="reference internal" href="cuda.html">CUDA Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending_types.html">Extending pyarrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending.html">Using pyarrow from C++ and Cython Code</a></li>
<li class="toctree-l2"><a class="reference internal" href="api.html">API Reference</a></li>
<li class="toctree-l2"><a class="reference internal" href="getting_involved.html">Getting Involved</a></li>
<li class="toctree-l2"><a class="reference internal" href="benchmarks.html">Benchmarks</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li>
<li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li>
</ul>
<p class="caption"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../developers/contributing.html">Contributing to Apache Arrow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/cpp/index.html">C++ Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/python.html">Python Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/archery.html">Daily Development using Archery</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/crossbow.html">Packaging and Testing with Crossbow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/docker.html">Running Docker Builds</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/benchmarks.html">Benchmarks</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/documentation.html">Building the Documentation</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">Apache Arrow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home"></a> &raquo;</li>
<li><a href="index.html">Python bindings</a> &raquo;</li>
<li>Filesystem Interface</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/python/filesystems.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="filesystem-interface">
<span id="filesystem"></span><h1>Filesystem Interface<a class="headerlink" href="#filesystem-interface" title="Permalink to this headline"></a></h1>
<p>PyArrow comes with an abstract filesystem interface, as well as concrete
implementations for various storage types.</p>
<p>The filesystem interface provides input and output streams as well as
directory operations. A simplified view of the underlying data
storage is exposed. Data paths are represented as <em>abstract paths</em>, which
are <code class="docutils literal notranslate"><span class="pre">/</span></code>-separated, even on Windows, and shouldn’t include special path
components such as <code class="docutils literal notranslate"><span class="pre">.</span></code> and <code class="docutils literal notranslate"><span class="pre">..</span></code>. Symbolic links, if supported by the
underlying storage, are automatically dereferenced. Only basic
<a class="reference internal" href="generated/pyarrow.fs.FileInfo.html#pyarrow.fs.FileInfo" title="pyarrow.fs.FileInfo"><code class="xref py py-class docutils literal notranslate"><span class="pre">metadata</span></code></a> about file entries, such as the file size
and modification time, is made available.</p>
<p>The core interface is represented by the base class <a class="reference internal" href="generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem" title="pyarrow.fs.FileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileSystem</span></code></a>.
Concrete subclasses are available for various kinds of storage, such as local
filesystem access (<a class="reference internal" href="generated/pyarrow.fs.LocalFileSystem.html#pyarrow.fs.LocalFileSystem" title="pyarrow.fs.LocalFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">LocalFileSystem</span></code></a>), HDFS (<a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HadoopFileSystem</span></code></a>)
and Amazon S3-compatible storage (<code class="xref py py-class docutils literal notranslate"><span class="pre">S3FileSystem</span></code>).</p>
<div class="section" id="usage">
<h2>Usage<a class="headerlink" href="#usage" title="Permalink to this headline"></a></h2>
<p>A FileSystem object can be created with one of the constructors (check the
respective constructor for its options):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="k">import</span> <span class="n">fs</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">local</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">LocalFileSystem</span><span class="p">()</span>
</pre></div>
</div>
<p>or alternatively inferred from a URI:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">s3</span><span class="p">,</span> <span class="n">path</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">FileSystem</span><span class="o">.</span><span class="n">from_uri</span><span class="p">(</span><span class="s2">&quot;s3://my-bucket&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">s3</span>
<span class="go">&lt;pyarrow._s3fs.S3FileSystem at 0x7f6760cbf4f0&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span>
<span class="go">&#39;my-bucket&#39;</span>
</pre></div>
</div>
<div class="section" id="reading-and-writing-files">
<h3>Reading and writing files<a class="headerlink" href="#reading-and-writing-files" title="Permalink to this headline"></a></h3>
<p>Several of the IO-related functions in PyArrow accept either a URI (and infer
the filesystem) or an explicit <code class="docutils literal notranslate"><span class="pre">filesystem</span></code> argument to specify the filesystem
to read or write from. For example, the <a class="reference internal" href="generated/pyarrow.parquet.read_table.html#pyarrow.parquet.read_table" title="pyarrow.parquet.read_table"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.parquet.read_table()</span></code></a>
function can be used in the following ways:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># using a URI -&gt; filesystem is inferred</span>
<span class="n">pq</span><span class="o">.</span><span class="n">read_table</span><span class="p">(</span><span class="s2">&quot;s3://my-bucket/data.parquet&quot;</span><span class="p">)</span>
<span class="c1"># using a path and filesystem</span>
<span class="n">s3</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="o">..</span><span class="p">)</span>
<span class="n">pq</span><span class="o">.</span><span class="n">read_table</span><span class="p">(</span><span class="s2">&quot;my-bucket/data.parquet&quot;</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">s3</span><span class="p">)</span>
</pre></div>
</div>
<p>The filesystem interface also allows you to open files for reading (input) or
writing (output) directly, which can be combined with functions that work with
file-like objects. For example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">local</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">LocalFileSystem</span><span class="p">()</span>
<span class="k">with</span> <span class="n">local</span><span class="o">.</span><span class="n">open_output_stream</span><span class="p">(</span><span class="s2">&quot;test.arrow&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">file</span><span class="p">:</span>
    <span class="k">with</span> <span class="n">pa</span><span class="o">.</span><span class="n">RecordBatchFileWriter</span><span class="p">(</span><span class="n">file</span><span class="p">,</span> <span class="n">table</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span>
        <span class="n">writer</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="listing-files">
<h3>Listing files<a class="headerlink" href="#listing-files" title="Permalink to this headline"></a></h3>
<p>Inspecting the directories and files on a filesystem can be done with the
<a class="reference internal" href="generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.get_file_info" title="pyarrow.fs.FileSystem.get_file_info"><code class="xref py py-meth docutils literal notranslate"><span class="pre">FileSystem.get_file_info()</span></code></a> method. To list the contents of a directory,
use the <a class="reference internal" href="generated/pyarrow.fs.FileSelector.html#pyarrow.fs.FileSelector" title="pyarrow.fs.FileSelector"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileSelector</span></code></a> object to specify the selection:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">local</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="n">fs</span><span class="o">.</span><span class="n">FileSelector</span><span class="p">(</span><span class="s2">&quot;dataset/&quot;</span><span class="p">,</span> <span class="n">recursive</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
<span class="go">[&lt;FileInfo for &#39;dataset/part=B&#39;: type=FileType.Directory&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;dataset/part=B/data0.parquet&#39;: type=FileType.File, size=1564&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;dataset/part=A&#39;: type=FileType.Directory&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;dataset/part=A/data0.parquet&#39;: type=FileType.File, size=1564&gt;]</span>
</pre></div>
</div>
<p>This returns a list of <a class="reference internal" href="generated/pyarrow.fs.FileInfo.html#pyarrow.fs.FileInfo" title="pyarrow.fs.FileInfo"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileInfo</span></code></a> objects, containing information about
the type (file or directory), the size, the date last modified, etc.</p>
<p>You can also get this information for a single explicit path (or list of
paths):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">local</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="s1">&#39;test.arrow&#39;</span><span class="p">)</span>
<span class="go">&lt;FileInfo for &#39;test.arrow&#39;: type=FileType.File, size=3250&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">local</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="s1">&#39;non_existent&#39;</span><span class="p">)</span>
<span class="go">&lt;FileInfo for &#39;non_existent&#39;: type=FileType.NotFound&gt;</span>
</pre></div>
</div>
</div>
</div>
<div class="section" id="s3">
<h2>S3<a class="headerlink" href="#s3" title="Permalink to this headline"></a></h2>
<p>The <code class="xref py py-class docutils literal notranslate"><span class="pre">S3FileSystem</span></code> constructor has several options to configure the S3
connection (e.g. credentials, the region, an endpoint override, etc.). In
addition, the constructor will also inspect configured S3 credentials as
supported by AWS (for example the <code class="docutils literal notranslate"><span class="pre">AWS_ACCESS_KEY_ID</span></code> and
<code class="docutils literal notranslate"><span class="pre">AWS_SECRET_ACCESS_KEY</span></code> environment variables).</p>
<p>Example of how you can read the contents of an S3 bucket:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="k">import</span> <span class="n">fs</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">s3</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">region</span><span class="o">=</span><span class="s1">&#39;eu-west-3&#39;</span><span class="p">)</span>
<span class="go"># List all contents in a bucket, recursively</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">s3</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="n">fs</span><span class="o">.</span><span class="n">FileSelector</span><span class="p">(</span><span class="s1">&#39;my-test-bucket&#39;</span><span class="p">,</span> <span class="n">recursive</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
<span class="go">[&lt;FileInfo for &#39;my-test-bucket/File1&#39;: type=FileType.File, size=10&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/File5&#39;: type=FileType.File, size=10&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/Dir1&#39;: type=FileType.Directory&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/Dir2&#39;: type=FileType.Directory&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/EmptyDir&#39;: type=FileType.Directory&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/Dir1/File2&#39;: type=FileType.File, size=11&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/Dir1/Subdir&#39;: type=FileType.Directory&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/Dir2/Subdir&#39;: type=FileType.Directory&gt;,</span>
<span class="go"> &lt;FileInfo for &#39;my-test-bucket/Dir2/Subdir/File3&#39;: type=FileType.File, size=10&gt;]</span>
<span class="go"># Open a file for reading and download its contents</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">f</span> <span class="o">=</span> <span class="n">s3</span><span class="o">.</span><span class="n">open_input_stream</span><span class="p">(</span><span class="s1">&#39;my-test-bucket/Dir1/File2&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">f</span><span class="o">.</span><span class="n">readall</span><span class="p">()</span>
<span class="go">b&#39;some data&#39;</span>
</pre></div>
</div>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<p>See the <a class="reference external" href="https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/credentials.html">AWS docs</a>
for the different ways to configure the AWS credentials.</p>
</div>
</div>
<div class="section" id="hadoop-file-system-hdfs">
<h2>Hadoop File System (HDFS)<a class="headerlink" href="#hadoop-file-system-hdfs" title="Permalink to this headline"></a></h2>
<p>PyArrow comes with bindings to the Hadoop File System (based on C++ bindings
using <code class="docutils literal notranslate"><span class="pre">libhdfs</span></code>, a JNI-based interface to the Java Hadoop client). You connect
using the <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HadoopFileSystem</span></code></a> constructor:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span>
<span class="n">hdfs</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">HadoopFileSystem</span><span class="p">(</span><span class="n">host</span><span class="p">,</span> <span class="n">port</span><span class="p">,</span> <span class="n">user</span><span class="o">=</span><span class="n">user</span><span class="p">,</span> <span class="n">kerb_ticket</span><span class="o">=</span><span class="n">ticket_cache_path</span><span class="p">)</span>
</pre></div>
</div>
<p>The <code class="docutils literal notranslate"><span class="pre">libhdfs</span></code> library is loaded <strong>at runtime</strong> (rather than at link / library
load time, since the library may not be in your LD_LIBRARY_PATH), and relies on
some environment variables.</p>
<ul>
<li><p><code class="docutils literal notranslate"><span class="pre">HADOOP_HOME</span></code>: the root of your installed Hadoop distribution. Often has
<cite>lib/native/libhdfs.so</cite>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">JAVA_HOME</span></code>: the location of your Java SDK installation.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">ARROW_LIBHDFS_DIR</span></code> (optional): explicit location of <code class="docutils literal notranslate"><span class="pre">libhdfs.so</span></code> if it is
installed somewhere other than <code class="docutils literal notranslate"><span class="pre">$HADOOP_HOME/lib/native</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">CLASSPATH</span></code>: must contain the Hadoop jars. You can set it using:</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span> <span class="nv">CLASSPATH</span><span class="o">=</span><span class="sb">`</span><span class="nv">$HADOOP_HOME</span>/bin/hdfs classpath --glob<span class="sb">`</span>
</pre></div>
</div>
<p>If <code class="docutils literal notranslate"><span class="pre">CLASSPATH</span></code> is not set, then it will be set automatically if the
<code class="docutils literal notranslate"><span class="pre">hadoop</span></code> executable is in your system path, or if <code class="docutils literal notranslate"><span class="pre">HADOOP_HOME</span></code> is set.</p>
</li>
</ul>
</div>
<div class="section" id="using-fsspec-compatible-filesystems">
<h2>Using fsspec-compatible filesystems<a class="headerlink" href="#using-fsspec-compatible-filesystems" title="Permalink to this headline"></a></h2>
<p>The filesystems mentioned above are natively supported by Arrow C++ / PyArrow.
The Python ecosystem, however, also has several filesystem packages. Those
packages following the
<a class="reference external" href="https://filesystem-spec.readthedocs.io/en/latest/">fsspec</a> interface can be
used in PyArrow as well.</p>
<p>Functions accepting a filesystem object will also accept an fsspec subclass.
For example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># creating an fsspec-based filesystem object for Google Cloud Storage</span>
<span class="kn">import</span> <span class="nn">gcsfs</span>
<span class="n">fs</span> <span class="o">=</span> <span class="n">gcsfs</span><span class="o">.</span><span class="n">GCSFileSystem</span><span class="p">(</span><span class="n">project</span><span class="o">=</span><span class="s1">&#39;my-google-project&#39;</span><span class="p">)</span>
<span class="c1"># using this to read a partitioned dataset</span>
<span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="k">as</span> <span class="nn">ds</span>
<span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">&quot;data/&quot;</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">fs</span><span class="p">)</span>
</pre></div>
</div>
<p>Under the hood, the fsspec filesystem object is wrapped into a python-based
PyArrow filesystem (<a class="reference internal" href="generated/pyarrow.fs.PyFileSystem.html#pyarrow.fs.PyFileSystem" title="pyarrow.fs.PyFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">PyFileSystem</span></code></a>) using <a class="reference internal" href="generated/pyarrow.fs.FSSpecHandler.html#pyarrow.fs.FSSpecHandler" title="pyarrow.fs.FSSpecHandler"><code class="xref py py-class docutils literal notranslate"><span class="pre">FSSpecHandler</span></code></a>.
You can also manually do this to get an object with the PyArrow FileSystem
interface:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="k">import</span> <span class="n">PyFileSystem</span><span class="p">,</span> <span class="n">FSSpecHandler</span>
<span class="n">pa_fs</span> <span class="o">=</span> <span class="n">PyFileSystem</span><span class="p">(</span><span class="n">FSSpecHandler</span><span class="p">(</span><span class="n">fs</span><span class="p">))</span>
</pre></div>
</div>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="filesystems_deprecated.html" class="btn btn-neutral float-right" title="Filesystem Interface (legacy)" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
<a href="ipc.html" class="btn btn-neutral float-left" title="Streaming, Serialization, and IPC" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&#169; Copyright 2016-2019 Apache Software Foundation.
</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
// Hand jQuery a callback to run once the DOM is ready; the callback
// enables the Sphinx RTD theme's navigation, passing `true` through to enable().
jQuery(function enableThemeNavigation() {
  SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script></body>
</html>