blob: a4500e90c4954f92744b2f4b8f3ff743f9c17151 [file] [log] [blame]
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Pandas Integration &mdash; Apache Arrow v3.0.0</title>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../_static/theme_overrides.css" type="text/css" />
<!--[if lt IE 9]>
<script src="../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/language_data.js"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="canonical" href="https://arrow.apache.org/docs/python/pandas.html" />
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Timestamps" href="timestamps.html" />
<link rel="prev" title="NumPy Integration" href="numpy.html" />
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["setDoNotTrack", true]);
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home"> Apache Arrow
</a>
<div class="version">
3.0.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Specifications and Protocols</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../format/Versioning.html">Format Versioning and Stability</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Columnar.html">Arrow Columnar Format</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Integration.html">Integration Testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/CDataInterface.html">The Arrow C data interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/CStreamInterface.html">The Arrow C stream interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Other.html">Other Data Structures</a></li>
</ul>
<p class="caption"><span class="caption-text">Libraries</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../status.html">Implementation Status</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cpp/index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li>
<li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li>
<li class="toctree-l1"><a class="reference internal" href="../java/index.html">Java</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/julia/Arrow/README.md">Julia</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Python</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="install.html">Installing PyArrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="memory.html">Memory and IO Interfaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="data.html">Data Types and In-Memory Data Model</a></li>
<li class="toctree-l2"><a class="reference internal" href="compute.html">Compute Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="ipc.html">Streaming, Serialization, and IPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="filesystems.html">Filesystem Interface</a></li>
<li class="toctree-l2"><a class="reference internal" href="filesystems_deprecated.html">Filesystem Interface (legacy)</a></li>
<li class="toctree-l2"><a class="reference internal" href="plasma.html">The Plasma In-Memory Object Store</a></li>
<li class="toctree-l2"><a class="reference internal" href="numpy.html">NumPy Integration</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Pandas Integration</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#dataframes">DataFrames</a></li>
<li class="toctree-l3"><a class="reference internal" href="#series">Series</a></li>
<li class="toctree-l3"><a class="reference internal" href="#handling-pandas-indexes">Handling pandas Indexes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#type-differences">Type differences</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#pandas-arrow-conversion">pandas -&gt; Arrow Conversion</a></li>
<li class="toctree-l4"><a class="reference internal" href="#arrow-pandas-conversion">Arrow -&gt; pandas Conversion</a></li>
<li class="toctree-l4"><a class="reference internal" href="#categorical-types">Categorical types</a></li>
<li class="toctree-l4"><a class="reference internal" href="#datetime-timestamp-types">Datetime (Timestamp) types</a></li>
<li class="toctree-l4"><a class="reference internal" href="#date-types">Date types</a></li>
<li class="toctree-l4"><a class="reference internal" href="#time-types">Time types</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#memory-usage-and-zero-copy">Memory Usage and Zero Copy</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#zero-copy-series-conversions">Zero Copy Series Conversions</a></li>
<li class="toctree-l4"><a class="reference internal" href="#reducing-memory-use-in-table-to-pandas">Reducing Memory Use in <code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code></a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="timestamps.html">Timestamps</a></li>
<li class="toctree-l2"><a class="reference internal" href="csv.html">Reading CSV files</a></li>
<li class="toctree-l2"><a class="reference internal" href="feather.html">Feather File Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="json.html">Reading JSON files</a></li>
<li class="toctree-l2"><a class="reference internal" href="parquet.html">Reading and Writing the Apache Parquet Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="dataset.html">Tabular Datasets</a></li>
<li class="toctree-l2"><a class="reference internal" href="cuda.html">CUDA Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending_types.html">Extending pyarrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending.html">Using pyarrow from C++ and Cython Code</a></li>
<li class="toctree-l2"><a class="reference internal" href="api.html">API Reference</a></li>
<li class="toctree-l2"><a class="reference internal" href="getting_involved.html">Getting Involved</a></li>
<li class="toctree-l2"><a class="reference internal" href="benchmarks.html">Benchmarks</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li>
<li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li>
</ul>
<p class="caption"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../developers/contributing.html">Contributing to Apache Arrow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/cpp/index.html">C++ Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/python.html">Python Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/archery.html">Daily Development using Archery</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/crossbow.html">Packaging and Testing with Crossbow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/docker.html">Running Docker Builds</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/benchmarks.html">Benchmarks</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/documentation.html">Building the Documentation</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">Apache Arrow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home"></a> &raquo;</li>
<li><a href="index.html">Python bindings</a> &raquo;</li>
<li>Pandas Integration</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/python/pandas.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="pandas-integration">
<span id="pandas-interop"></span><h1>Pandas Integration<a class="headerlink" href="#pandas-integration" title="Permalink to this headline"></a></h1>
<p>To interface with <a class="reference external" href="https://pandas.pydata.org/">pandas</a>, PyArrow provides
various conversion routines to consume pandas structures and convert back
to them.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>While pandas uses NumPy as a backend, it has enough peculiarities
(such as a different type system, and support for null values) that this
is a separate topic from <a class="reference internal" href="numpy.html#numpy-interop"><span class="std std-ref">NumPy Integration</span></a>.</p>
</div>
<p>To follow examples in this document, make sure to run:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span>
</pre></div>
</div>
<div class="section" id="dataframes">
<h2>DataFrames<a class="headerlink" href="#dataframes" title="Permalink to this headline"></a></h2>
<p>The equivalent to a pandas DataFrame in Arrow is a <a class="reference internal" href="data.html#data-table"><span class="std std-ref">Table</span></a>.
Both consist of a set of named columns of equal length. While pandas only
supports flat columns, the Table also provides nested columns, thus it can
represent more data than a DataFrame, so a full conversion is not always possible.</p>
<p>Conversion from a Table to a DataFrame is done by calling
<a class="reference internal" href="generated/pyarrow.Table.html#pyarrow.Table.to_pandas" title="pyarrow.Table.to_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.to_pandas()</span></code></a>. The inverse is then achieved by using
<a class="reference internal" href="generated/pyarrow.Table.html#pyarrow.Table.from_pandas" title="pyarrow.Table.from_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.from_pandas()</span></code></a>.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;a&quot;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]})</span>
<span class="c1"># Convert from pandas to Arrow</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="c1"># Convert back to pandas</span>
<span class="n">df_new</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="c1"># Infer Arrow schema from pandas</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">Schema</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
</pre></div>
</div>
<p>By default <code class="docutils literal notranslate"><span class="pre">pyarrow</span></code> tries to preserve and restore the <code class="docutils literal notranslate"><span class="pre">.index</span></code>
data as accurately as possible. See the section below for more about
this, and how to disable this logic.</p>
</div>
<div class="section" id="series">
<h2>Series<a class="headerlink" href="#series" title="Permalink to this headline"></a></h2>
<p>In Arrow, the most similar structure to a pandas Series is an Array.
It is a vector that contains data of the same type in linear memory. You can
convert a pandas Series to an Arrow Array using <a class="reference internal" href="generated/pyarrow.Array.html#pyarrow.Array.from_pandas" title="pyarrow.Array.from_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Array.from_pandas()</span></code></a>.
As Arrow Arrays are always nullable, you can supply an optional mask using
the <code class="docutils literal notranslate"><span class="pre">mask</span></code> parameter to mark all null-entries.</p>
</div>
<div class="section" id="handling-pandas-indexes">
<h2>Handling pandas Indexes<a class="headerlink" href="#handling-pandas-indexes" title="Permalink to this headline"></a></h2>
<p>Methods like <a class="reference internal" href="generated/pyarrow.Table.html#pyarrow.Table.from_pandas" title="pyarrow.Table.from_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.from_pandas()</span></code></a> have a
<code class="docutils literal notranslate"><span class="pre">preserve_index</span></code> option which defines how to preserve (store) or not
to preserve (to not store) the data in the <code class="docutils literal notranslate"><span class="pre">index</span></code> member of the
corresponding pandas object. This data is tracked using schema-level
metadata in the internal <code class="docutils literal notranslate"><span class="pre">arrow::Schema</span></code> object.</p>
<p>The default of <code class="docutils literal notranslate"><span class="pre">preserve_index</span></code> is <code class="docutils literal notranslate"><span class="pre">None</span></code>, which behaves as
follows:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">RangeIndex</span></code> is stored as metadata-only, not requiring any extra
storage.</p></li>
<li><p>Other index types are stored as one or more physical data columns in
the resulting <code class="xref py py-class docutils literal notranslate"><span class="pre">Table</span></code>.</p></li>
</ul>
<p>To not store the index at all pass <code class="docutils literal notranslate"><span class="pre">preserve_index=False</span></code>. Since
storing a <code class="docutils literal notranslate"><span class="pre">RangeIndex</span></code> can cause issues in some limited scenarios
(such as storing multiple DataFrame objects in a Parquet file), to
force all index data to be serialized in the resulting table, pass
<code class="docutils literal notranslate"><span class="pre">preserve_index=True</span></code>.</p>
</div>
<div class="section" id="type-differences">
<h2>Type differences<a class="headerlink" href="#type-differences" title="Permalink to this headline"></a></h2>
<p>With the current design of pandas and Arrow, it is not possible to convert all
column types unmodified. One of the main issues here is that pandas has no
support for nullable columns of arbitrary type. Also <code class="docutils literal notranslate"><span class="pre">datetime64</span></code> is currently
fixed to nanosecond resolution. On the other hand, Arrow might still be missing
support for some types.</p>
<div class="section" id="pandas-arrow-conversion">
<h3>pandas -&gt; Arrow Conversion<a class="headerlink" href="#pandas-arrow-conversion" title="Permalink to this headline"></a></h3>
<table class="docutils align-default">
<colgroup>
<col style="width: 48%" />
<col style="width: 52%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Source Type (pandas)</p></th>
<th class="head"><p>Destination Type (Arrow)</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">bool</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">BOOL</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">(u)int{8,16,32,64}</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">(U)INT{8,16,32,64}</span></code></p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">float32</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">FLOAT</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">float64</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">DOUBLE</span></code></p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">str</span></code> / <code class="docutils literal notranslate"><span class="pre">unicode</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">STRING</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">pd.Categorical</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">DICTIONARY</span></code></p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">pd.Timestamp</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">TIMESTAMP(unit=ns)</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">datetime.date</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">DATE</span></code></p></td>
</tr>
</tbody>
</table>
</div>
<div class="section" id="arrow-pandas-conversion">
<h3>Arrow -&gt; pandas Conversion<a class="headerlink" href="#arrow-pandas-conversion" title="Permalink to this headline"></a></h3>
<table class="docutils align-default">
<colgroup>
<col style="width: 40%" />
<col style="width: 60%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Source Type (Arrow)</p></th>
<th class="head"><p>Destination Type (pandas)</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">BOOL</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">bool</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">BOOL</span></code> <em>with nulls</em></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">object</span></code> (with values <code class="docutils literal notranslate"><span class="pre">True</span></code>, <code class="docutils literal notranslate"><span class="pre">False</span></code>, <code class="docutils literal notranslate"><span class="pre">None</span></code>)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">(U)INT{8,16,32,64}</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">(u)int{8,16,32,64}</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">(U)INT{8,16,32,64}</span></code> <em>with nulls</em></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">float64</span></code></p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">FLOAT</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">float32</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">DOUBLE</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">float64</span></code></p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">STRING</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">str</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">DICTIONARY</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">pd.Categorical</span></code></p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">TIMESTAMP(unit=*)</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">pd.Timestamp</span></code> (<code class="docutils literal notranslate"><span class="pre">np.datetime64[ns]</span></code>)</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">DATE</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">object</span></code> (with <code class="docutils literal notranslate"><span class="pre">datetime.date</span></code> objects)</p></td>
</tr>
</tbody>
</table>
</div>
<div class="section" id="categorical-types">
<h3>Categorical types<a class="headerlink" href="#categorical-types" title="Permalink to this headline"></a></h3>
<p>TODO</p>
</div>
<div class="section" id="datetime-timestamp-types">
<h3>Datetime (Timestamp) types<a class="headerlink" href="#datetime-timestamp-types" title="Permalink to this headline"></a></h3>
<p>TODO</p>
</div>
<div class="section" id="date-types">
<h3>Date types<a class="headerlink" href="#date-types" title="Permalink to this headline"></a></h3>
<p>While dates can be handled using the <code class="docutils literal notranslate"><span class="pre">datetime64[ns]</span></code> type in
pandas, some systems work with object arrays of Python’s built-in
<code class="docutils literal notranslate"><span class="pre">datetime.date</span></code> object:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [3]: </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">date</span>
<span class="gp">In [4]: </span><span class="n">s</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="n">date</span><span class="p">(</span><span class="mi">2018</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">31</span><span class="p">),</span> <span class="bp">None</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)])</span>
<span class="gp">In [5]: </span><span class="n">s</span>
<span class="gh">Out[5]: </span><span class="go"></span>
<span class="go">0 2018-12-31</span>
<span class="go">1 None</span>
<span class="go">2 2000-01-01</span>
<span class="go">dtype: object</span>
</pre></div>
</div>
<p>When converting to an Arrow array, the <code class="docutils literal notranslate"><span class="pre">date32</span></code> type will be used by
default:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [6]: </span><span class="n">arr</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">s</span><span class="p">)</span>
<span class="gp">In [7]: </span><span class="n">arr</span><span class="o">.</span><span class="n">type</span>
<span class="gh">Out[7]: </span><span class="go">DataType(date32[day])</span>
<span class="gp">In [8]: </span><span class="n">arr</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="gh">Out[8]: </span><span class="go">&lt;pyarrow.Date32Scalar: datetime.date(2018, 12, 31)&gt;</span>
</pre></div>
</div>
<p>To use the 64-bit <code class="docutils literal notranslate"><span class="pre">date64</span></code>, specify this explicitly:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [9]: </span><span class="n">arr</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s1">&#39;date64&#39;</span><span class="p">)</span>
<span class="gp">In [10]: </span><span class="n">arr</span><span class="o">.</span><span class="n">type</span>
<span class="gh">Out[10]: </span><span class="go">DataType(date64[ms])</span>
</pre></div>
</div>
<p>When converting back with <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code>, object arrays of
<code class="docutils literal notranslate"><span class="pre">datetime.date</span></code> objects are returned:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [11]: </span><span class="n">arr</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[11]: </span><span class="go"></span>
<span class="go">0 2018-12-31</span>
<span class="go">1 None</span>
<span class="go">2 2000-01-01</span>
<span class="go">dtype: object</span>
</pre></div>
</div>
<p>If you want to use NumPy’s <code class="docutils literal notranslate"><span class="pre">datetime64</span></code> dtype instead, pass
<code class="docutils literal notranslate"><span class="pre">date_as_object=False</span></code>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [12]: </span><span class="n">s2</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">arr</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(</span><span class="n">date_as_object</span><span class="o">=</span><span class="bp">False</span><span class="p">))</span>
<span class="gp">In [13]: </span><span class="n">s2</span><span class="o">.</span><span class="n">dtype</span>
<span class="gh">Out[13]: </span><span class="go">dtype(&#39;&lt;M8[ns]&#39;)</span>
</pre></div>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>As of Arrow <code class="docutils literal notranslate"><span class="pre">0.13</span></code> the parameter <code class="docutils literal notranslate"><span class="pre">date_as_object</span></code> is <code class="docutils literal notranslate"><span class="pre">True</span></code>
by default. Older versions must pass <code class="docutils literal notranslate"><span class="pre">date_as_object=True</span></code> to
obtain this behavior.</p>
</div>
</div>
<div class="section" id="time-types">
<h3>Time types<a class="headerlink" href="#time-types" title="Permalink to this headline"></a></h3>
<p>TODO</p>
</div>
</div>
<div class="section" id="memory-usage-and-zero-copy">
<h2>Memory Usage and Zero Copy<a class="headerlink" href="#memory-usage-and-zero-copy" title="Permalink to this headline"></a></h2>
<p>When converting from Arrow data structures to pandas objects using various
<code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> methods, one must occasionally be mindful of issues related to
performance and memory usage.</p>
<p>Since pandas’s internal data representation is generally different from the
Arrow columnar format, zero copy conversions (where no memory allocation or
computation is required) are only possible in certain limited cases.</p>
<p>In the worst case scenario, calling <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> will result in two versions
of the data in memory, one for Arrow and one for pandas, yielding approximately
twice the memory footprint. We have implemented some mitigations for this case,
particularly when creating large <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> objects, that we describe below.</p>
<div class="section" id="zero-copy-series-conversions">
<h3>Zero Copy Series Conversions<a class="headerlink" href="#zero-copy-series-conversions" title="Permalink to this headline"></a></h3>
<p>Zero copy conversions from <code class="docutils literal notranslate"><span class="pre">Array</span></code> or <code class="docutils literal notranslate"><span class="pre">ChunkedArray</span></code> to NumPy arrays or
pandas Series are possible in certain narrow cases:</p>
<ul class="simple">
<li><p>The Arrow data is stored in an integer (signed or unsigned <code class="docutils literal notranslate"><span class="pre">int8</span></code> through
<code class="docutils literal notranslate"><span class="pre">int64</span></code>) or floating point type (<code class="docutils literal notranslate"><span class="pre">float16</span></code> through <code class="docutils literal notranslate"><span class="pre">float64</span></code>). This
includes many numeric types as well as timestamps.</p></li>
<li><p>The Arrow data has no null values (since these are represented using bitmaps
which are not supported by pandas).</p></li>
<li><p>For <code class="docutils literal notranslate"><span class="pre">ChunkedArray</span></code>, the data consists of a single chunk,
i.e. <code class="docutils literal notranslate"><span class="pre">arr.num_chunks</span> <span class="pre">==</span> <span class="pre">1</span></code>. Multiple chunks will always require a copy
because of pandas’s contiguousness requirement.</p></li>
</ul>
<p>In these scenarios, <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> or <code class="docutils literal notranslate"><span class="pre">to_numpy</span></code> will be zero copy. In all
other scenarios, a copy will be required.</p>
</div>
<div class="section" id="reducing-memory-use-in-table-to-pandas">
<h3>Reducing Memory Use in <code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code><a class="headerlink" href="#reducing-memory-use-in-table-to-pandas" title="Permalink to this headline"></a></h3>
<p>As of this writing, pandas applies a data management strategy called
“consolidation” to collect like-typed DataFrame columns in two-dimensional
NumPy arrays, referred to internally as “blocks”. We have gone to great effort
to construct the precise “consolidated” blocks so that pandas will not perform
any further allocation or copies after we hand off the data to
<code class="docutils literal notranslate"><span class="pre">pandas.DataFrame</span></code>. The obvious downside of this consolidation strategy is
that it forces a “memory doubling”.</p>
<p>To try to limit the potential effects of “memory doubling” during
<code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code>, we provide a couple of options:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">split_blocks=True</span></code>, when enabled, <code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code> produces one internal
DataFrame “block” for each column, skipping the “consolidation” step. Note
that many pandas operations will trigger consolidation anyway, but the peak
memory use may be less than the worst case scenario of a full memory
doubling. As a result of this option, we are able to do zero copy conversions
of columns in the same cases where we can do zero copy with <code class="docutils literal notranslate"><span class="pre">Array</span></code> and
<code class="docutils literal notranslate"><span class="pre">ChunkedArray</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">self_destruct=True</span></code>, this destroys the internal Arrow memory buffers in
each column of the <code class="docutils literal notranslate"><span class="pre">Table</span></code> object as they are converted to the pandas-compatible
representation, potentially releasing memory to the operating system as soon
as a column is converted. Note that this renders the calling <code class="docutils literal notranslate"><span class="pre">Table</span></code> object
unsafe for further use, and any further methods called will cause your Python
process to crash.</p></li>
</ul>
<p>Used together, the call</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(</span><span class="n">split_blocks</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">self_destruct</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="k">del</span> <span class="n">table</span> <span class="c1"># not necessary, but a good practice</span>
</pre></div>
</div>
<p>will yield significantly lower memory usage in some scenarios. Without these
options, <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> will always double memory.</p>
</div>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="timestamps.html" class="btn btn-neutral float-right" title="Timestamps" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
<a href="numpy.html" class="btn btn-neutral float-left" title="NumPy Integration" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&#169; Copyright 2016-2019 Apache Software Foundation.
</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script></body>
</html>