blob: 9eaaf54cd78e3830d37f757019307b64048c10c6 [file] [log] [blame]
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Streaming, Serialization, and IPC &mdash; Apache Arrow v1.0.1</title>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<!--[if lt IE 9]>
<script src="../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/language_data.js"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="canonical" href="https://arrow.apache.org/docs/python/ipc.html" />
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Filesystem Interface" href="filesystems.html" />
<link rel="prev" title="Data Types and In-Memory Data Model" href="data.html" />
<!-- Matomo -->
<script>
// Matomo analytics bootstrap: queue tracker commands on the global `_paq`
// array, then inject the matomo.js loader script asynchronously.
// Reuse window._paq if it already exists; otherwise start with an empty queue.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
// NOTE(review): presumably these two commands ask Matomo to honour the
// browser's Do-Not-Track setting and to avoid cookies — confirm against the
// Matomo tracking API docs.
_paq.push(["setDoNotTrack", true]);
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
// IIFE keeps the helper variables (u, d, g, s) out of the global scope.
(function() {
// Base URL shared by the tracking endpoint (matomo.php) and the loader (matomo.js).
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
// g is the new <script> element; s is the first <script> already in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
// Mark the loader async and insert it before the first existing script.
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home" title="Documentation Home"> Apache Arrow
</a>
<div class="version">
1.0.1
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Specifications and Protocols</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../format/Versioning.html">Format Versioning and Stability</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Columnar.html">Arrow Columnar Format</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Integration.html">Integration Testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/CDataInterface.html">The Arrow C data interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Other.html">Other Data Structures</a></li>
</ul>
<p class="caption"><span class="caption-text">Libraries</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../status.html">Implementation Status</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cpp/index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li>
<li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li>
<li class="toctree-l1"><a class="reference internal" href="../java/index.html">Java</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Python</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="install.html">Installing PyArrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="memory.html">Memory and IO Interfaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="data.html">Data Types and In-Memory Data Model</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Streaming, Serialization, and IPC</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#writing-and-reading-streams">Writing and Reading Streams</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#using-streams">Using streams</a></li>
<li class="toctree-l4"><a class="reference internal" href="#writing-and-reading-random-access-files">Writing and Reading Random Access Files</a></li>
<li class="toctree-l4"><a class="reference internal" href="#reading-from-stream-and-file-format-for-pandas">Reading from Stream and File Format for pandas</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#arbitrary-object-serialization">Arbitrary Object Serialization</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#serializing-custom-data-types">Serializing Custom Data Types</a></li>
<li class="toctree-l4"><a class="reference internal" href="#component-based-serialization">Component-based Serialization</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#serializing-pandas-objects">Serializing pandas Objects</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="filesystems.html">Filesystem Interface</a></li>
<li class="toctree-l2"><a class="reference internal" href="filesystems_deprecated.html">Filesystem Interface (legacy)</a></li>
<li class="toctree-l2"><a class="reference internal" href="plasma.html">The Plasma In-Memory Object Store</a></li>
<li class="toctree-l2"><a class="reference internal" href="numpy.html">NumPy Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="pandas.html">Pandas Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="timestamps.html">Timestamps</a></li>
<li class="toctree-l2"><a class="reference internal" href="csv.html">Reading CSV files</a></li>
<li class="toctree-l2"><a class="reference internal" href="feather.html">Feather File Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="json.html">Reading JSON files</a></li>
<li class="toctree-l2"><a class="reference internal" href="parquet.html">Reading and Writing the Apache Parquet Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="dataset.html">Tabular Datasets</a></li>
<li class="toctree-l2"><a class="reference internal" href="cuda.html">CUDA Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending_types.html">Extending pyarrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending.html">Using pyarrow from C++ and Cython Code</a></li>
<li class="toctree-l2"><a class="reference internal" href="api.html">API Reference</a></li>
<li class="toctree-l2"><a class="reference internal" href="getting_involved.html">Getting Involved</a></li>
<li class="toctree-l2"><a class="reference internal" href="benchmarks.html">Benchmarks</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li>
<li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li>
</ul>
<p class="caption"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../developers/contributing.html">Contributing to Apache Arrow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/cpp/index.html">C++ Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/python.html">Python Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/archery.html">Daily Development using Archery</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/crossbow.html">Packaging and Testing with Crossbow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/docker.html">Running Docker Builds</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/benchmarks.html">Benchmarks</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/documentation.html">Building the Documentation</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">Apache Arrow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Documentation home"></a> &raquo;</li>
<li><a href="index.html">Python bindings</a> &raquo;</li>
<li>Streaming, Serialization, and IPC</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/python/ipc.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="streaming-serialization-and-ipc">
<span id="ipc"></span><h1>Streaming, Serialization, and IPC<a class="headerlink" href="#streaming-serialization-and-ipc" title="Permalink to this headline">¶</a></h1>
<div class="section" id="writing-and-reading-streams">
<h2>Writing and Reading Streams<a class="headerlink" href="#writing-and-reading-streams" title="Permalink to this headline">¶</a></h2>
<p>Arrow defines two types of binary formats for serializing record batches:</p>
<ul class="simple">
<li><p><strong>Streaming format</strong>: for sending an arbitrary length sequence of record
batches. The format must be processed from start to end, and does not support
random access</p></li>
<li><p><strong>File or Random Access format</strong>: for serializing a fixed number of record
batches. Supports random access, and thus is very useful when used with
memory maps</p></li>
</ul>
<p>To follow this section, make sure to first read the section on <a class="reference internal" href="memory.html#io"><span class="std std-ref">Memory and
IO</span></a>.</p>
<div class="section" id="using-streams">
<h3>Using streams<a class="headerlink" href="#using-streams" title="Permalink to this headline">¶</a></h3>
<p>First, let’s create a small record batch:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span>
<span class="gp">In [2]: </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp"> ...: </span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]),</span>
<span class="gp"> ...: </span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="s1">&#39;foo&#39;</span><span class="p">,</span> <span class="s1">&#39;bar&#39;</span><span class="p">,</span> <span class="s1">&#39;baz&#39;</span><span class="p">,</span> <span class="bp">None</span><span class="p">]),</span>
<span class="gp"> ...: </span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="bp">True</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="bp">False</span><span class="p">,</span> <span class="bp">True</span><span class="p">])</span>
<span class="gp"> ...: </span><span class="p">]</span>
<span class="gp"> ...: </span>
<span class="gp">In [3]: </span><span class="n">batch</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">record_batch</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;f0&#39;</span><span class="p">,</span> <span class="s1">&#39;f1&#39;</span><span class="p">,</span> <span class="s1">&#39;f2&#39;</span><span class="p">])</span>
<span class="gp">In [4]: </span><span class="n">batch</span><span class="o">.</span><span class="n">num_rows</span>
<span class="gh">Out[4]: </span><span class="go">4</span>
<span class="gp">In [5]: </span><span class="n">batch</span><span class="o">.</span><span class="n">num_columns</span>
<span class="gh">Out[5]: </span><span class="go">3</span>
</pre></div>
</div>
<p>Now, we can begin writing a stream containing some number of these batches. For
this we use <code class="xref py py-class docutils literal notranslate"><span class="pre">RecordBatchStreamWriter</span></code>, which can write to a
writeable <code class="docutils literal notranslate"><span class="pre">NativeFile</span></code> object or a writeable Python object. For convenience,
this one can be created with <a class="reference internal" href="generated/pyarrow.ipc.new_stream.html#pyarrow.ipc.new_stream" title="pyarrow.ipc.new_stream"><code class="xref py py-func docutils literal notranslate"><span class="pre">new_stream()</span></code></a>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [6]: </span><span class="n">sink</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">BufferOutputStream</span><span class="p">()</span>
<span class="gp">In [7]: </span><span class="n">writer</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">ipc</span><span class="o">.</span><span class="n">new_stream</span><span class="p">(</span><span class="n">sink</span><span class="p">,</span> <span class="n">batch</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span>
</pre></div>
</div>
<p>Here we used an in-memory Arrow buffer stream, but this could have been a
socket or some other IO sink.</p>
<p>When creating the <code class="docutils literal notranslate"><span class="pre">StreamWriter</span></code>, we pass the schema, since the schema
(column names and types) must be the same for all of the batches sent in this
particular stream. Now we can do:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [8]: </span><span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">5</span><span class="p">):</span>
<span class="gp"> ...: </span> <span class="n">writer</span><span class="o">.</span><span class="n">write_batch</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span>
<span class="gp"> ...: </span>
<span class="gp">In [9]: </span><span class="n">writer</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="gp">In [10]: </span><span class="n">buf</span> <span class="o">=</span> <span class="n">sink</span><span class="o">.</span><span class="n">getvalue</span><span class="p">()</span>
<span class="gp">In [11]: </span><span class="n">buf</span><span class="o">.</span><span class="n">size</span>
<span class="gh">Out[11]: </span><span class="go">1984</span>
</pre></div>
</div>
<p>Now <code class="docutils literal notranslate"><span class="pre">buf</span></code> contains the complete stream as an in-memory byte buffer. We can
read such a stream with <code class="xref py py-class docutils literal notranslate"><span class="pre">RecordBatchStreamReader</span></code> or the
convenience function <code class="docutils literal notranslate"><span class="pre">pyarrow.ipc.open_stream</span></code>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [12]: </span><span class="n">reader</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">ipc</span><span class="o">.</span><span class="n">open_stream</span><span class="p">(</span><span class="n">buf</span><span class="p">)</span>
<span class="gp">In [13]: </span><span class="n">reader</span><span class="o">.</span><span class="n">schema</span>
<span class="gh">Out[13]: </span><span class="go"></span>
<span class="go">f0: int64</span>
<span class="go">f1: string</span>
<span class="go">f2: bool</span>
<span class="gp">In [14]: </span><span class="n">batches</span> <span class="o">=</span> <span class="p">[</span><span class="n">b</span> <span class="k">for</span> <span class="n">b</span> <span class="ow">in</span> <span class="n">reader</span><span class="p">]</span>
<span class="gp">In [15]: </span><span class="nb">len</span><span class="p">(</span><span class="n">batches</span><span class="p">)</span>
<span class="gh">Out[15]: </span><span class="go">5</span>
</pre></div>
</div>
<p>We can check the returned batches are the same as the original input:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [16]: </span><span class="n">batches</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span>
<span class="gh">Out[16]: </span><span class="go">True</span>
</pre></div>
</div>
<p>An important point is that if the input source supports zero-copy reads
(e.g. a memory map, or <code class="docutils literal notranslate"><span class="pre">pyarrow.BufferReader</span></code>), then the returned
batches are also zero-copy and do not allocate any new memory on read.</p>
</div>
<div class="section" id="writing-and-reading-random-access-files">
<h3>Writing and Reading Random Access Files<a class="headerlink" href="#writing-and-reading-random-access-files" title="Permalink to this headline">¶</a></h3>
<p>The <code class="xref py py-class docutils literal notranslate"><span class="pre">RecordBatchFileWriter</span></code> has the same API as
<code class="xref py py-class docutils literal notranslate"><span class="pre">RecordBatchStreamWriter</span></code>. You can create one with
<a class="reference internal" href="generated/pyarrow.ipc.new_file.html#pyarrow.ipc.new_file" title="pyarrow.ipc.new_file"><code class="xref py py-func docutils literal notranslate"><span class="pre">new_file()</span></code></a>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [17]: </span><span class="n">sink</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">BufferOutputStream</span><span class="p">()</span>
<span class="gp">In [18]: </span><span class="n">writer</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">ipc</span><span class="o">.</span><span class="n">new_file</span><span class="p">(</span><span class="n">sink</span><span class="p">,</span> <span class="n">batch</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span>
<span class="gp">In [19]: </span><span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">):</span>
<span class="gp"> ....: </span> <span class="n">writer</span><span class="o">.</span><span class="n">write_batch</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span>
<span class="gp"> ....: </span>
<span class="gp">In [20]: </span><span class="n">writer</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="gp">In [21]: </span><span class="n">buf</span> <span class="o">=</span> <span class="n">sink</span><span class="o">.</span><span class="n">getvalue</span><span class="p">()</span>
<span class="gp">In [22]: </span><span class="n">buf</span><span class="o">.</span><span class="n">size</span>
<span class="gh">Out[22]: </span><span class="go">4226</span>
</pre></div>
</div>
<p>The difference between <code class="xref py py-class docutils literal notranslate"><span class="pre">RecordBatchFileReader</span></code> and
<code class="xref py py-class docutils literal notranslate"><span class="pre">RecordBatchStreamReader</span></code> is that the input source must have a
<code class="docutils literal notranslate"><span class="pre">seek</span></code> method for random access. The stream reader only requires read
operations. We can also use the <a class="reference internal" href="generated/pyarrow.ipc.open_file.html#pyarrow.ipc.open_file" title="pyarrow.ipc.open_file"><code class="xref py py-func docutils literal notranslate"><span class="pre">open_file()</span></code></a> method to open a file:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [23]: </span><span class="n">reader</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">ipc</span><span class="o">.</span><span class="n">open_file</span><span class="p">(</span><span class="n">buf</span><span class="p">)</span>
</pre></div>
</div>
<p>Because we have access to the entire payload, we know the number of record
batches in the file, and can read any at random:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [24]: </span><span class="n">reader</span><span class="o">.</span><span class="n">num_record_batches</span>
<span class="gh">Out[24]: </span><span class="go">10</span>
<span class="gp">In [25]: </span><span class="n">b</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">get_batch</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gp">In [26]: </span><span class="n">b</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span>
<span class="gh">Out[26]: </span><span class="go">True</span>
</pre></div>
</div>
</div>
<div class="section" id="reading-from-stream-and-file-format-for-pandas">
<h3>Reading from Stream and File Format for pandas<a class="headerlink" href="#reading-from-stream-and-file-format-for-pandas" title="Permalink to this headline">¶</a></h3>
<p>The stream and file reader classes have a special <code class="docutils literal notranslate"><span class="pre">read_pandas</span></code> method to
simplify reading multiple record batches and converting them to a single
DataFrame output:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [27]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">ipc</span><span class="o">.</span><span class="n">open_file</span><span class="p">(</span><span class="n">buf</span><span class="p">)</span><span class="o">.</span><span class="n">read_pandas</span><span class="p">()</span>
<span class="gp">In [28]: </span><span class="n">df</span><span class="p">[:</span><span class="mi">5</span><span class="p">]</span>
<span class="gh">Out[28]: </span><span class="go"></span>
<span class="go"> f0 f1 f2</span>
<span class="go">0 1 foo True</span>
<span class="go">1 2 bar None</span>
<span class="go">2 3 baz False</span>
<span class="go">3 4 None True</span>
<span class="go">4 1 foo True</span>
</pre></div>
</div>
</div>
</div>
<div class="section" id="arbitrary-object-serialization">
<h2>Arbitrary Object Serialization<a class="headerlink" href="#arbitrary-object-serialization" title="Permalink to this headline">¶</a></h2>
<p>In <code class="docutils literal notranslate"><span class="pre">pyarrow</span></code> we are able to serialize and deserialize many kinds of Python
objects. While not a complete replacement for the <code class="docutils literal notranslate"><span class="pre">pickle</span></code> module, these
functions can be significantly faster, particularly when dealing with collections
of NumPy arrays.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>While the functions in this section utilize the Arrow stream protocol
internally, they do not produce data that is compatible with the above
<code class="docutils literal notranslate"><span class="pre">ipc.open_file</span></code> and <code class="docutils literal notranslate"><span class="pre">ipc.open_stream</span></code> functions.</p>
</div>
<p>As an example, consider a dictionary containing NumPy arrays:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [29]: </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
<span class="gp">In [30]: </span><span class="n">data</span> <span class="o">=</span> <span class="p">{</span>
<span class="gp"> ....: </span> <span class="n">i</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">500</span><span class="p">,</span> <span class="mi">500</span><span class="p">)</span>
<span class="gp"> ....: </span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="gp"> ....: </span><span class="p">}</span>
<span class="gp"> ....: </span>
</pre></div>
</div>
<p>We use the <code class="docutils literal notranslate"><span class="pre">pyarrow.serialize</span></code> function to convert this data to a byte
buffer:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [31]: </span><span class="n">buf</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">serialize</span><span class="p">(</span><span class="n">data</span><span class="p">)</span><span class="o">.</span><span class="n">to_buffer</span><span class="p">()</span>
<span class="gp">In [32]: </span><span class="nb">type</span><span class="p">(</span><span class="n">buf</span><span class="p">)</span>
<span class="gh">Out[32]: </span><span class="go">pyarrow.lib.Buffer</span>
<span class="gp">In [33]: </span><span class="n">buf</span><span class="o">.</span><span class="n">size</span>
<span class="gh">Out[33]: </span><span class="go">200028928</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">pyarrow.serialize</span></code> creates an intermediate object which can be converted to
a buffer (the <code class="docutils literal notranslate"><span class="pre">to_buffer</span></code> method) or written directly to an output stream.</p>
<p><code class="docutils literal notranslate"><span class="pre">pyarrow.deserialize</span></code> converts a buffer-like object back to the original
Python object:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [34]: </span><span class="n">restored_data</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">deserialize</span><span class="p">(</span><span class="n">buf</span><span class="p">)</span>
<span class="gp">In [35]: </span><span class="n">restored_data</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="gh">Out[35]: </span><span class="go"></span>
<span class="go">array([[ 0.31908496, -0.38497429, -0.00908503, ..., 0.59388712,</span>
<span class="go"> -0.4781306 , 0.05585997],</span>
<span class="go"> [ 0.17211542, 0.36950389, 1.65803031, ..., 0.13289332,</span>
<span class="go"> 0.81641384, -1.10876572],</span>
<span class="go"> [-0.28750533, 0.02887317, -1.16949495, ..., 0.68167732,</span>
<span class="go"> -0.94371386, 0.15949882],</span>
<span class="go"> ...,</span>
<span class="go"> [ 1.40029811, 0.4470198 , 1.55334769, ..., 1.96692071,</span>
<span class="go"> -0.2367957 , -1.20272686],</span>
<span class="go"> [-2.01700399, -1.10887375, 1.50930246, ..., 1.75533091,</span>
<span class="go"> 0.86458875, 0.0209724 ],</span>
<span class="go"> [-1.01071012, 0.7951678 , 1.43853346, ..., -0.84843836,</span>
<span class="go"> -1.50469422, -0.06000656]])</span>
</pre></div>
</div>
<p>When dealing with NumPy arrays, <code class="docutils literal notranslate"><span class="pre">pyarrow.deserialize</span></code> can be significantly
faster than <code class="docutils literal notranslate"><span class="pre">pickle</span></code> because the resulting arrays are zero-copy references
into the input buffer. The larger the arrays, the larger the performance
savings.</p>
<p>Consider this example. For <code class="docutils literal notranslate"><span class="pre">pyarrow.deserialize</span></code> we have:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [36]: </span><span class="o">%</span><span class="k">timeit</span> restored_data = pa.deserialize(buf)
<span class="go">8.81 ms +- 10.1 us per loop (mean +- std. dev. of 7 runs, 100 loops each)</span>
</pre></div>
</div>
<p>And for pickle:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [37]: </span><span class="kn">import</span> <span class="nn">pickle</span>
<span class="gp">In [38]: </span><span class="n">pickled</span> <span class="o">=</span> <span class="n">pickle</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">In [39]: </span><span class="o">%</span><span class="k">timeit</span> unpickled_data = pickle.loads(pickled)
<span class="go">74.7 ms +- 67.9 us per loop (mean +- std. dev. of 7 runs, 10 loops each)</span>
</pre></div>
</div>
<p>We aspire to make these functions a high-speed alternative to pickle for
transient serialization in Python big data applications.</p>
<div class="section" id="serializing-custom-data-types">
<h3>Serializing Custom Data Types<a class="headerlink" href="#serializing-custom-data-types" title="Permalink to this headline">¶</a></h3>
<p>If an unrecognized data type is encountered when serializing an object,
<code class="docutils literal notranslate"><span class="pre">pyarrow</span></code> will fall back on using <code class="docutils literal notranslate"><span class="pre">pickle</span></code> for converting that type to a
byte string. There may be a more efficient way, though.</p>
<p>Consider a class with two members, one of which is a NumPy array:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">MyData</span><span class="p">:</span>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">data</span><span class="p">):</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="n">data</span>
</pre></div>
</div>
<p>We write functions to convert this to and from a dictionary with simpler types:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">_serialize_MyData</span><span class="p">(</span><span class="n">val</span><span class="p">):</span>
    <span class="k">return</span> <span class="p">{</span><span class="s1">&#39;name&#39;</span><span class="p">:</span> <span class="n">val</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="s1">&#39;data&#39;</span><span class="p">:</span> <span class="n">val</span><span class="o">.</span><span class="n">data</span><span class="p">}</span>
<span class="k">def</span> <span class="nf">_deserialize_MyData</span><span class="p">(</span><span class="n">data</span><span class="p">):</span>
    <span class="k">return</span> <span class="n">MyData</span><span class="p">(</span><span class="n">data</span><span class="p">[</span><span class="s1">&#39;name&#39;</span><span class="p">],</span> <span class="n">data</span><span class="p">[</span><span class="s1">&#39;data&#39;</span><span class="p">])</span>
</pre></div>
</div>
<p>Then, we must register these functions in a <code class="docutils literal notranslate"><span class="pre">SerializationContext</span></code> so that
<code class="docutils literal notranslate"><span class="pre">MyData</span></code> can be recognized:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">context</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">SerializationContext</span><span class="p">()</span>
<span class="n">context</span><span class="o">.</span><span class="n">register_type</span><span class="p">(</span><span class="n">MyData</span><span class="p">,</span> <span class="s1">&#39;MyData&#39;</span><span class="p">,</span>
<span class="n">custom_serializer</span><span class="o">=</span><span class="n">_serialize_MyData</span><span class="p">,</span>
<span class="n">custom_deserializer</span><span class="o">=</span><span class="n">_deserialize_MyData</span><span class="p">)</span>
</pre></div>
</div>
<p>Lastly, we use this context as an additional argument to <code class="docutils literal notranslate"><span class="pre">pyarrow.serialize</span></code>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">buf</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">serialize</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">context</span><span class="o">=</span><span class="n">context</span><span class="p">)</span><span class="o">.</span><span class="n">to_buffer</span><span class="p">()</span>
<span class="n">restored_val</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">deserialize</span><span class="p">(</span><span class="n">buf</span><span class="p">,</span> <span class="n">context</span><span class="o">=</span><span class="n">context</span><span class="p">)</span>
</pre></div>
</div>
<p>The <code class="docutils literal notranslate"><span class="pre">SerializationContext</span></code> also has convenience methods <code class="docutils literal notranslate"><span class="pre">serialize</span></code> and
<code class="docutils literal notranslate"><span class="pre">deserialize</span></code>, so these are equivalent statements:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">buf</span> <span class="o">=</span> <span class="n">context</span><span class="o">.</span><span class="n">serialize</span><span class="p">(</span><span class="n">val</span><span class="p">)</span><span class="o">.</span><span class="n">to_buffer</span><span class="p">()</span>
<span class="n">restored_val</span> <span class="o">=</span> <span class="n">context</span><span class="o">.</span><span class="n">deserialize</span><span class="p">(</span><span class="n">buf</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="component-based-serialization">
<h3>Component-based Serialization<a class="headerlink" href="#component-based-serialization" title="Permalink to this headline">¶</a></h3>
<p>For serializing Python objects containing some number of NumPy arrays, Arrow
buffers, or other data types, it may be desirable to transport their serialized
representation without having to produce an intermediate copy using the
<code class="docutils literal notranslate"><span class="pre">to_buffer</span></code> method. To motivate this, suppose we have a list of NumPy arrays:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [40]: </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
<span class="gp">In [41]: </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">5</span><span class="p">)]</span>
</pre></div>
</div>
<p>The call <code class="docutils literal notranslate"><span class="pre">pa.serialize(data)</span></code> does not copy the memory inside each of these
NumPy arrays. This serialized representation can then be decomposed into a
dictionary containing a sequence of <code class="docutils literal notranslate"><span class="pre">pyarrow.Buffer</span></code> objects containing
metadata for each array and references to the memory inside the arrays. To do
this, use the <code class="docutils literal notranslate"><span class="pre">to_components</span></code> method:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [42]: </span><span class="n">serialized</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">serialize</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">In [43]: </span><span class="n">components</span> <span class="o">=</span> <span class="n">serialized</span><span class="o">.</span><span class="n">to_components</span><span class="p">()</span>
</pre></div>
</div>
<p>The particular details of the output of <code class="docutils literal notranslate"><span class="pre">to_components</span></code> are not too
important. The objects in the <code class="docutils literal notranslate"><span class="pre">'data'</span></code> field are <code class="docutils literal notranslate"><span class="pre">pyarrow.Buffer</span></code> objects,
which are zero-copy convertible to Python <code class="docutils literal notranslate"><span class="pre">memoryview</span></code> objects:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [44]: </span><span class="n">memoryview</span><span class="p">(</span><span class="n">components</span><span class="p">[</span><span class="s1">&#39;data&#39;</span><span class="p">][</span><span class="mi">0</span><span class="p">])</span>
<span class="gh">Out[44]: </span><span class="go">&lt;memory at 0x7f2f699cda08&gt;</span>
</pre></div>
</div>
<p>A memoryview can be converted back to an Arrow <code class="docutils literal notranslate"><span class="pre">Buffer</span></code> with
<code class="docutils literal notranslate"><span class="pre">pyarrow.py_buffer</span></code>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [45]: </span><span class="n">mv</span> <span class="o">=</span> <span class="n">memoryview</span><span class="p">(</span><span class="n">components</span><span class="p">[</span><span class="s1">&#39;data&#39;</span><span class="p">][</span><span class="mi">0</span><span class="p">])</span>
<span class="gp">In [46]: </span><span class="n">buf</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">py_buffer</span><span class="p">(</span><span class="n">mv</span><span class="p">)</span>
</pre></div>
</div>
<p>An object can be reconstructed from its component-based representation using
<code class="docutils literal notranslate"><span class="pre">deserialize_components</span></code>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [47]: </span><span class="n">restored_data</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">deserialize_components</span><span class="p">(</span><span class="n">components</span><span class="p">)</span>
<span class="gp">In [48]: </span><span class="n">restored_data</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="gh">Out[48]: </span><span class="go"></span>
<span class="go">array([[ 0.08272937, -1.62435083, -0.05553385, -0.17009072, -1.42342936,</span>
<span class="go"> -2.14976376, 2.08674725, -1.13165575, 0.35196661, 0.30062211],</span>
<span class="go"> [-1.64855238, 2.15327101, 0.24985174, 0.73395161, 0.33972162,</span>
<span class="go"> -0.14478569, 1.10654142, 0.67399034, 0.16480965, -1.40790188],</span>
<span class="go"> [ 1.52062309, 0.85876101, -0.78834157, 1.50678204, 1.21548628,</span>
<span class="go"> -0.768725 , 1.29781535, -2.04542496, 0.08160128, 0.92438068],</span>
<span class="go"> [-0.27505366, -0.96845967, -0.73970018, -2.03152045, -0.25347227,</span>
<span class="go"> -0.09009048, -1.33786189, 0.47357543, -0.66451895, -0.7304969 ],</span>
<span class="go"> [-1.53565458, 0.17464885, 1.1882689 , 1.39913389, -2.55505597,</span>
<span class="go"> -0.99447036, -1.3056554 , 0.63831016, 0.44225673, 0.1876823 ],</span>
<span class="go"> [ 2.03372544, -0.87916257, -0.65075426, 0.2109289 , -0.82366568,</span>
<span class="go"> -0.4311721 , 0.01720903, 2.31279004, 0.44759669, 0.50426095],</span>
<span class="go"> [ 0.74985502, -0.87754096, -1.84156315, 0.96375487, 0.0179234 ,</span>
<span class="go"> -0.20506921, 0.48606781, 0.80249654, -0.82538998, 0.15040379],</span>
<span class="go"> [ 0.29645408, 0.33675047, -0.36390788, -2.28558262, -0.76621032,</span>
<span class="go"> -1.40421301, -1.55362434, 0.1556816 , -0.26782507, -0.65797254],</span>
<span class="go"> [-0.4841557 , 0.04571714, 0.3263099 , -0.07581112, -0.16210661,</span>
<span class="go"> -0.35836661, 0.88125115, 0.33318043, 0.72369647, 0.59428462],</span>
<span class="go"> [-0.03647698, 0.71456644, 0.09034961, -1.17933729, -1.03682108,</span>
<span class="go"> -0.65117059, 1.80962159, -0.6355785 , 0.44556374, -0.19257742]])</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">deserialize_components</span></code> is also available as a method on
<code class="docutils literal notranslate"><span class="pre">SerializationContext</span></code> objects.</p>
</div>
</div>
<div class="section" id="serializing-pandas-objects">
<h2>Serializing pandas Objects<a class="headerlink" href="#serializing-pandas-objects" title="Permalink to this headline">¶</a></h2>
<p>The default serialization context has optimized handling of pandas
objects like <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> and <code class="docutils literal notranslate"><span class="pre">Series</span></code>. Combined with component-based
serialization above, this enables zero-copy transport of pandas DataFrame
objects not containing any Python objects:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [49]: </span><span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="gp">In [50]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;a&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">]})</span>
<span class="gp">In [51]: </span><span class="n">context</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">default_serialization_context</span><span class="p">()</span>
<span class="gp">In [52]: </span><span class="n">serialized_df</span> <span class="o">=</span> <span class="n">context</span><span class="o">.</span><span class="n">serialize</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">In [53]: </span><span class="n">df_components</span> <span class="o">=</span> <span class="n">serialized_df</span><span class="o">.</span><span class="n">to_components</span><span class="p">()</span>
<span class="gp">In [54]: </span><span class="n">original_df</span> <span class="o">=</span> <span class="n">context</span><span class="o">.</span><span class="n">deserialize_components</span><span class="p">(</span><span class="n">df_components</span><span class="p">)</span>
<span class="gp">In [55]: </span><span class="n">original_df</span>
<span class="gh">Out[55]: </span><span class="go"></span>
<span class="go"> a</span>
<span class="go">0 1</span>
<span class="go">1 2</span>
<span class="go">2 3</span>
<span class="go">3 4</span>
<span class="go">4 5</span>
</pre></div>
</div>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="filesystems.html" class="btn btn-neutral float-right" title="Filesystem Interface" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="data.html" class="btn btn-neutral float-left" title="Data Types and In-Memory Data Model" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright 2016-2019 Apache Software Foundation
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script></body>
</html>