| |
| |
| |
| <!DOCTYPE html> |
| <html class="writer-html5" lang="en" > |
| <head> |
| <meta charset="utf-8" /> |
| |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| |
| <title>Pandas Integration — Apache Arrow v3.0.0</title> |
| |
| |
| |
| <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" /> |
| <link rel="stylesheet" href="../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" href="../_static/theme_overrides.css" type="text/css" /> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <!--[if lt IE 9]> |
| <script src="../_static/js/html5shiv.min.js"></script> |
| <![endif]--> |
| |
| |
| <script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/language_data.js"></script> |
| |
| <script type="text/javascript" src="../_static/js/theme.js"></script> |
| |
| |
| <link rel="canonical" href="https://arrow.apache.org/docs/python/pandas.html" /> |
| <link rel="index" title="Index" href="../genindex.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Timestamps" href="timestamps.html" /> |
| <link rel="prev" title="NumPy Integration" href="numpy.html" /> |
|
|
|
|
| <!-- Matomo -->
|
| <script>
|
| var _paq = window._paq = window._paq || [];
|
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
|
| _paq.push(["setDoNotTrack", true]);
|
| _paq.push(["disableCookies"]);
|
| _paq.push(['trackPageView']);
|
| _paq.push(['enableLinkTracking']);
|
| (function() {
|
| var u="https://analytics.apache.org/";
|
| _paq.push(['setTrackerUrl', u+'matomo.php']);
|
| _paq.push(['setSiteId', '20']);
|
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
|
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
|
| })();
|
| </script>
|
| <!-- End Matomo Code -->
|
|
|
| </head> |
| |
| <body class="wy-body-for-nav"> |
| |
| |
| <div class="wy-grid-for-nav"> |
| |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side"> |
| <div class="wy-side-scroll"> |
| <div class="wy-side-nav-search" > |
| |
| |
| |
| <a href="../index.html" class="icon icon-home"> Apache Arrow |
| |
| |
| |
| </a> |
| |
| |
| |
| |
| <div class="version"> |
| 3.0.0 |
| </div> |
| |
| |
| |
| |
| <div role="search"> |
| <form id="rtd-search-form" class="wy-form" action="../search.html" method="get"> |
| <input type="text" name="q" placeholder="Search docs" /> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| </form> |
| </div> |
| |
| |
| </div> |
| |
| |
| <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> |
| |
| |
| |
| |
| |
| |
| <p class="caption"><span class="caption-text">Specifications and Protocols</span></p> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Versioning.html">Format Versioning and Stability</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Columnar.html">Arrow Columnar Format</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Flight.html">Arrow Flight RPC</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Integration.html">Integration Testing</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/CDataInterface.html">The Arrow C data interface</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/CStreamInterface.html">The Arrow C stream interface</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Other.html">Other Data Structures</a></li> |
| </ul> |
| <p class="caption"><span class="caption-text">Libraries</span></p> |
| <ul class="current"> |
| <li class="toctree-l1"><a class="reference internal" href="../status.html">Implementation Status</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../cpp/index.html">C++</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../java/index.html">Java</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/julia/Arrow/README.md">Julia</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li> |
| <li class="toctree-l1 current"><a class="reference internal" href="index.html">Python</a><ul class="current"> |
| <li class="toctree-l2"><a class="reference internal" href="install.html">Installing PyArrow</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="memory.html">Memory and IO Interfaces</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="data.html">Data Types and In-Memory Data Model</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="compute.html">Compute Functions</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="ipc.html">Streaming, Serialization, and IPC</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="filesystems.html">Filesystem Interface</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="filesystems_deprecated.html">Filesystem Interface (legacy)</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="plasma.html">The Plasma In-Memory Object Store</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="numpy.html">NumPy Integration</a></li> |
| <li class="toctree-l2 current"><a class="current reference internal" href="#">Pandas Integration</a><ul> |
| <li class="toctree-l3"><a class="reference internal" href="#dataframes">DataFrames</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#series">Series</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#handling-pandas-indexes">Handling pandas Indexes</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#type-differences">Type differences</a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#pandas-arrow-conversion">pandas -> Arrow Conversion</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#arrow-pandas-conversion">Arrow -> pandas Conversion</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#categorical-types">Categorical types</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datetime-timestamp-types">Datetime (Timestamp) types</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#date-types">Date types</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#time-types">Time types</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l3"><a class="reference internal" href="#memory-usage-and-zero-copy">Memory Usage and Zero Copy</a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#zero-copy-series-conversions">Zero Copy Series Conversions</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#reducing-memory-use-in-table-to-pandas">Reducing Memory Use in <code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code></a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l2"><a class="reference internal" href="timestamps.html">Timestamps</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="csv.html">Reading CSV files</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="feather.html">Feather File Format</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="json.html">Reading JSON files</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="parquet.html">Reading and Writing the Apache Parquet Format</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="dataset.html">Tabular Datasets</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="cuda.html">CUDA Integration</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="extending_types.html">Extending pyarrow</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="extending.html">Using pyarrow from C++ and Cython Code</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="api.html">API Reference</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="getting_involved.html">Getting Involved</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="benchmarks.html">Benchmarks</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li> |
| </ul> |
| <p class="caption"><span class="caption-text">Development</span></p> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/contributing.html">Contributing to Apache Arrow</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/cpp/index.html">C++ Development</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/python.html">Python Development</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/archery.html">Daily Development using Archery</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/crossbow.html">Packaging and Testing with Crossbow</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/docker.html">Running Docker Builds</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/benchmarks.html">Benchmarks</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/documentation.html">Building the Documentation</a></li> |
| </ul> |
| |
| |
| |
| </div> |
| |
| </div> |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> |
| |
| |
| <nav class="wy-nav-top" aria-label="top navigation"> |
| |
| <i data-toggle="wy-nav-top" class="fa fa-bars"></i> |
| <a href="../index.html">Apache Arrow</a> |
| |
| </nav> |
| |
| |
| <div class="wy-nav-content"> |
| |
| <div class="rst-content"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <div role="navigation" aria-label="breadcrumbs navigation"> |
| |
| <ul class="wy-breadcrumbs"> |
| |
| <li><a href="../index.html" class="icon icon-home"></a> »</li> |
| |
| <li><a href="index.html">Python bindings</a> »</li> |
| |
| <li>Pandas Integration</li> |
| |
| |
| <li class="wy-breadcrumbs-aside"> |
| |
| |
| <a href="../_sources/python/pandas.rst.txt" rel="nofollow"> View page source</a> |
| |
| |
| </li> |
| |
| </ul> |
| |
| |
| <hr/> |
| </div> |
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <div class="section" id="pandas-integration"> |
| <span id="pandas-interop"></span><h1>Pandas Integration<a class="headerlink" href="#pandas-integration" title="Permalink to this headline">¶</a></h1> |
| <p>To interface with <a class="reference external" href="https://pandas.pydata.org/">pandas</a>, PyArrow provides |
| various conversion routines to consume pandas structures and convert back |
| to them.</p> |
| <div class="admonition note"> |
| <p class="admonition-title">Note</p> |
| <p>While pandas uses NumPy as a backend, it has enough peculiarities |
| (such as a different type system, and support for null values) that this |
| is a separate topic from <a class="reference internal" href="numpy.html#numpy-interop"><span class="std std-ref">NumPy Integration</span></a>.</p> |
| </div> |
| <p>To follow examples in this document, make sure to run:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span> |
| |
| <span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span> |
| </pre></div> |
| </div> |
| <div class="section" id="dataframes"> |
| <h2>DataFrames<a class="headerlink" href="#dataframes" title="Permalink to this headline">¶</a></h2> |
| <p>The equivalent to a pandas DataFrame in Arrow is a <a class="reference internal" href="data.html#data-table"><span class="std std-ref">Table</span></a>. |
| Both consist of a set of named columns of equal length. While pandas only |
| supports flat columns, the Table also provides nested columns, thus it can |
| represent more data than a DataFrame, so a full conversion is not always possible.</p> |
| <p>Conversion from a Table to a DataFrame is done by calling |
| <a class="reference internal" href="generated/pyarrow.Table.html#pyarrow.Table.to_pandas" title="pyarrow.Table.to_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.to_pandas()</span></code></a>. The inverse is then achieved by using |
| <a class="reference internal" href="generated/pyarrow.Table.html#pyarrow.Table.from_pandas" title="pyarrow.Table.from_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.from_pandas()</span></code></a>.</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="kn">as</span> <span class="nn">pd</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">"a"</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]})</span> |
| <span class="c1"># Convert from pandas to Arrow</span> |
| <span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="c1"># Convert back to pandas</span> |
| <span class="n">df_new</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| |
| <span class="c1"># Infer Arrow schema from pandas</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">Schema</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>By default <code class="docutils literal notranslate"><span class="pre">pyarrow</span></code> tries to preserve and restore the <code class="docutils literal notranslate"><span class="pre">.index</span></code> |
| data as accurately as possible. See the section below for more about |
| this, and how to disable this logic.</p> |
| </div> |
| <div class="section" id="series"> |
| <h2>Series<a class="headerlink" href="#series" title="Permalink to this headline">¶</a></h2> |
| <p>In Arrow, the most similar structure to a pandas Series is an Array. |
| It is a vector that contains data of the same type in linear memory. You can |
| convert a pandas Series to an Arrow Array using <a class="reference internal" href="generated/pyarrow.Array.html#pyarrow.Array.from_pandas" title="pyarrow.Array.from_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Array.from_pandas()</span></code></a>. |
| As Arrow Arrays are always nullable, you can supply an optional mask using |
| the <code class="docutils literal notranslate"><span class="pre">mask</span></code> parameter to mark all null-entries.</p> |
| </div> |
| <div class="section" id="handling-pandas-indexes"> |
| <h2>Handling pandas Indexes<a class="headerlink" href="#handling-pandas-indexes" title="Permalink to this headline">¶</a></h2> |
| <p>Methods like <a class="reference internal" href="generated/pyarrow.Table.html#pyarrow.Table.from_pandas" title="pyarrow.Table.from_pandas"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.from_pandas()</span></code></a> have a |
| <code class="docutils literal notranslate"><span class="pre">preserve_index</span></code> option which defines how to preserve (store) or not |
| to preserve (to not store) the data in the <code class="docutils literal notranslate"><span class="pre">index</span></code> member of the |
| corresponding pandas object. This data is tracked using schema-level |
| metadata in the internal <code class="docutils literal notranslate"><span class="pre">arrow::Schema</span></code> object.</p> |
| <p>The default of <code class="docutils literal notranslate"><span class="pre">preserve_index</span></code> is <code class="docutils literal notranslate"><span class="pre">None</span></code>, which behaves as |
| follows:</p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">RangeIndex</span></code> is stored as metadata-only, not requiring any extra |
| storage.</p></li> |
| <li><p>Other index types are stored as one or more physical data columns in |
| the resulting <code class="xref py py-class docutils literal notranslate"><span class="pre">Table</span></code>.</p></li> |
| </ul> |
| <p>To not store the index at all pass <code class="docutils literal notranslate"><span class="pre">preserve_index=False</span></code>. Since |
| storing a <code class="docutils literal notranslate"><span class="pre">RangeIndex</span></code> can cause issues in some limited scenarios |
| (such as storing multiple DataFrame objects in a Parquet file), to |
| force all index data to be serialized in the resulting table, pass |
| <code class="docutils literal notranslate"><span class="pre">preserve_index=True</span></code>.</p> |
| </div> |
| <div class="section" id="type-differences"> |
| <h2>Type differences<a class="headerlink" href="#type-differences" title="Permalink to this headline">¶</a></h2> |
| <p>With the current design of pandas and Arrow, it is not possible to convert all |
| column types unmodified. One of the main issues here is that pandas has no |
| support for nullable columns of arbitrary type. Also <code class="docutils literal notranslate"><span class="pre">datetime64</span></code> is currently |
| fixed to nanosecond resolution. On the other hand, Arrow might still be missing |
| support for some types.</p> |
| <div class="section" id="pandas-arrow-conversion"> |
| <h3>pandas -> Arrow Conversion<a class="headerlink" href="#pandas-arrow-conversion" title="Permalink to this headline">¶</a></h3> |
| <table class="docutils align-default"> |
| <colgroup> |
| <col style="width: 48%" /> |
| <col style="width: 52%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Source Type (pandas)</p></th> |
| <th class="head"><p>Destination Type (Arrow)</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">bool</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">BOOL</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">(u)int{8,16,32,64}</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">(U)INT{8,16,32,64}</span></code></p></td> |
| </tr> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">float32</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">FLOAT</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">float64</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">DOUBLE</span></code></p></td> |
| </tr> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">str</span></code> / <code class="docutils literal notranslate"><span class="pre">unicode</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">STRING</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">pd.Categorical</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">DICTIONARY</span></code></p></td> |
| </tr> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">pd.Timestamp</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">TIMESTAMP(unit=ns)</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">datetime.date</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">DATE</span></code></p></td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <div class="section" id="arrow-pandas-conversion"> |
| <h3>Arrow -> pandas Conversion<a class="headerlink" href="#arrow-pandas-conversion" title="Permalink to this headline">¶</a></h3> |
| <table class="docutils align-default"> |
| <colgroup> |
| <col style="width: 40%" /> |
| <col style="width: 60%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Source Type (Arrow)</p></th> |
| <th class="head"><p>Destination Type (pandas)</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">BOOL</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">bool</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">BOOL</span></code> <em>with nulls</em></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">object</span></code> (with values <code class="docutils literal notranslate"><span class="pre">True</span></code>, <code class="docutils literal notranslate"><span class="pre">False</span></code>, <code class="docutils literal notranslate"><span class="pre">None</span></code>)</p></td> |
| </tr> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">(U)INT{8,16,32,64}</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">(u)int{8,16,32,64}</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">(U)INT{8,16,32,64}</span></code> <em>with nulls</em></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">float64</span></code></p></td> |
| </tr> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">FLOAT</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">float32</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">DOUBLE</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">float64</span></code></p></td> |
| </tr> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">STRING</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">str</span></code></p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">DICTIONARY</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">pd.Categorical</span></code></p></td> |
| </tr> |
| <tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">TIMESTAMP(unit=*)</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">pd.Timestamp</span></code> (<code class="docutils literal notranslate"><span class="pre">np.datetime64[ns]</span></code>)</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">DATE</span></code></p></td> |
| <td><p><code class="docutils literal notranslate"><span class="pre">object</span></code> (with <code class="docutils literal notranslate"><span class="pre">datetime.date</span></code> objects)</p></td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <div class="section" id="categorical-types"> |
| <h3>Categorical types<a class="headerlink" href="#categorical-types" title="Permalink to this headline">¶</a></h3> |
| <p>TODO</p> |
| </div> |
| <div class="section" id="datetime-timestamp-types"> |
| <h3>Datetime (Timestamp) types<a class="headerlink" href="#datetime-timestamp-types" title="Permalink to this headline">¶</a></h3> |
| <p>TODO</p> |
| </div> |
| <div class="section" id="date-types"> |
| <h3>Date types<a class="headerlink" href="#date-types" title="Permalink to this headline">¶</a></h3> |
| <p>While dates can be handled using the <code class="docutils literal notranslate"><span class="pre">datetime64[ns]</span></code> type in |
| pandas, some systems work with object arrays of Python’s built-in |
| <code class="docutils literal notranslate"><span class="pre">datetime.date</span></code> object:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [3]: </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">date</span> |
| |
| <span class="gp">In [4]: </span><span class="n">s</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="n">date</span><span class="p">(</span><span class="mi">2018</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">31</span><span class="p">),</span> <span class="bp">None</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)])</span> |
| |
| <span class="gp">In [5]: </span><span class="n">s</span> |
| <span class="gh">Out[5]: </span><span class="go"></span> |
| <span class="go">0 2018-12-31</span> |
| <span class="go">1 None</span> |
| <span class="go">2 2000-01-01</span> |
| <span class="go">dtype: object</span> |
| </pre></div> |
| </div> |
| <p>When converting to an Arrow array, the <code class="docutils literal notranslate"><span class="pre">date32</span></code> type will be used by |
| default:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [6]: </span><span class="n">arr</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> |
| |
| <span class="gp">In [7]: </span><span class="n">arr</span><span class="o">.</span><span class="n">type</span> |
| <span class="gh">Out[7]: </span><span class="go">DataType(date32[day])</span> |
| |
| <span class="gp">In [8]: </span><span class="n">arr</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="gh">Out[8]: </span><span class="go">&lt;pyarrow.Date32Scalar: datetime.date(2018, 12, 31)&gt;</span> |
| </pre></div> |
| </div> |
| <p>To use the 64-bit <code class="docutils literal notranslate"><span class="pre">date64</span></code>, specify this explicitly:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [9]: </span><span class="n">arr</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s1">'date64'</span><span class="p">)</span> |
| |
| <span class="gp">In [10]: </span><span class="n">arr</span><span class="o">.</span><span class="n">type</span> |
| <span class="gh">Out[10]: </span><span class="go">DataType(date64[ms])</span> |
| </pre></div> |
| </div> |
| <p>When converting back with <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code>, object arrays of |
| <code class="docutils literal notranslate"><span class="pre">datetime.date</span></code> objects are returned:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [11]: </span><span class="n">arr</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[11]: </span><span class="go"></span> |
| <span class="go">0 2018-12-31</span> |
| <span class="go">1 None</span> |
| <span class="go">2 2000-01-01</span> |
| <span class="go">dtype: object</span> |
| </pre></div> |
| </div> |
| <p>If you want to use NumPy’s <code class="docutils literal notranslate"><span class="pre">datetime64</span></code> dtype instead, pass |
| <code class="docutils literal notranslate"><span class="pre">date_as_object=False</span></code>:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [12]: </span><span class="n">s2</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">arr</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(</span><span class="n">date_as_object</span><span class="o">=</span><span class="bp">False</span><span class="p">))</span> |
| |
| <span class="gp">In [13]: </span><span class="n">s2</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="gh">Out[13]: </span><span class="go">dtype('&lt;M8[ns]')</span> |
| </pre></div> |
| </div> |
| <div class="admonition warning"> |
| <p class="admonition-title">Warning</p> |
| <p>As of Arrow <code class="docutils literal notranslate"><span class="pre">0.13</span></code> the parameter <code class="docutils literal notranslate"><span class="pre">date_as_object</span></code> is <code class="docutils literal notranslate"><span class="pre">True</span></code> |
| by default. Older versions must pass <code class="docutils literal notranslate"><span class="pre">date_as_object=True</span></code> to |
| obtain this behavior.</p> |
| </div> |
| </div> |
| <div class="section" id="time-types"> |
| <h3>Time types<a class="headerlink" href="#time-types" title="Permalink to this headline">¶</a></h3> |
| <p>TODO</p> |
| </div> |
| </div> |
| <div class="section" id="memory-usage-and-zero-copy"> |
| <h2>Memory Usage and Zero Copy<a class="headerlink" href="#memory-usage-and-zero-copy" title="Permalink to this headline">¶</a></h2> |
| <p>When converting from Arrow data structures to pandas objects using various |
| <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> methods, one must occasionally be mindful of issues related to |
| performance and memory usage.</p> |
| <p>Since pandas’s internal data representation is generally different from the |
| Arrow columnar format, zero copy conversions (where no memory allocation or |
| computation is required) are only possible in certain limited cases.</p> |
| <p>In the worst case scenario, calling <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> will result in two versions |
| of the data in memory, one for Arrow and one for pandas, yielding approximately |
| twice the memory footprint. We have implemented some mitigations for this case, |
| particularly when creating large <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> objects, that we describe below.</p> |
| <div class="section" id="zero-copy-series-conversions"> |
| <h3>Zero Copy Series Conversions<a class="headerlink" href="#zero-copy-series-conversions" title="Permalink to this headline">¶</a></h3> |
| <p>Zero copy conversions from <code class="docutils literal notranslate"><span class="pre">Array</span></code> or <code class="docutils literal notranslate"><span class="pre">ChunkedArray</span></code> to NumPy arrays or |
| pandas Series are possible in certain narrow cases:</p> |
| <ul class="simple"> |
| <li><p>The Arrow data is stored in an integer (signed or unsigned <code class="docutils literal notranslate"><span class="pre">int8</span></code> through |
| <code class="docutils literal notranslate"><span class="pre">int64</span></code>) or floating point type (<code class="docutils literal notranslate"><span class="pre">float16</span></code> through <code class="docutils literal notranslate"><span class="pre">float64</span></code>). This |
| includes many numeric types as well as timestamps.</p></li> |
| <li><p>The Arrow data has no null values (since these are represented using bitmaps |
| which are not supported by pandas).</p></li> |
| <li><p>For <code class="docutils literal notranslate"><span class="pre">ChunkedArray</span></code>, the data consists of a single chunk, |
| i.e. <code class="docutils literal notranslate"><span class="pre">arr.num_chunks</span> <span class="pre">==</span> <span class="pre">1</span></code>. Multiple chunks will always require a copy |
| because of pandas’s contiguousness requirement.</p></li> |
| </ul> |
| <p>In these scenarios, <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> or <code class="docutils literal notranslate"><span class="pre">to_numpy</span></code> will be zero copy. In all |
| other scenarios, a copy will be required.</p> |
| </div> |
| <div class="section" id="reducing-memory-use-in-table-to-pandas"> |
| <h3>Reducing Memory Use in <code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code><a class="headerlink" href="#reducing-memory-use-in-table-to-pandas" title="Permalink to this headline">¶</a></h3> |
| <p>As of this writing, pandas applies a data management strategy called |
| “consolidation” to collect like-typed DataFrame columns in two-dimensional |
| NumPy arrays, referred to internally as “blocks”. We have gone to great effort |
| to construct the precise “consolidated” blocks so that pandas will not perform |
| any further allocation or copies after we hand off the data to |
| <code class="docutils literal notranslate"><span class="pre">pandas.DataFrame</span></code>. The obvious downside of this consolidation strategy is |
| that it forces a “memory doubling”.</p> |
| <p>To try to limit the potential effects of “memory doubling” during |
| <code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code>, we provide a couple of options:</p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">split_blocks=True</span></code>: when enabled, <code class="docutils literal notranslate"><span class="pre">Table.to_pandas</span></code> produces one internal |
| DataFrame “block” for each column, skipping the “consolidation” step. Note |
| that many pandas operations will trigger consolidation anyway, but the peak |
| memory use may be less than the worst case scenario of a full memory |
| doubling. As a result of this option, we are able to do zero copy conversions |
| of columns in the same cases where we can do zero copy with <code class="docutils literal notranslate"><span class="pre">Array</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">ChunkedArray</span></code>.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">self_destruct=True</span></code>: this destroys the internal Arrow memory buffers in |
| each column of the <code class="docutils literal notranslate"><span class="pre">Table</span></code> object as they are converted to the pandas-compatible |
| representation, potentially releasing memory to the operating system as soon |
| as a column is converted. Note that this renders the calling <code class="docutils literal notranslate"><span class="pre">Table</span></code> object |
| unsafe for further use, and any further methods called will cause your Python |
| process to crash.</p></li> |
| </ul> |
| <p>Used together, the call</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(</span><span class="n">split_blocks</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">self_destruct</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span> |
| <span class="k">del</span> <span class="n">table</span> <span class="c1"># not necessary, but a good practice</span> |
| </pre></div> |
| </div> |
| <p>will yield significantly lower memory usage in some scenarios. Without these |
| options, <code class="docutils literal notranslate"><span class="pre">to_pandas</span></code> will always double memory.</p> |
| </div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| </div> |
| <footer> |
| <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> |
| <a href="timestamps.html" class="btn btn-neutral float-right" title="Timestamps" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a> |
| <a href="numpy.html" class="btn btn-neutral float-left" title="NumPy Integration" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a> |
| </div> |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <p> |
| © Copyright 2016-2019 Apache Software Foundation. |
| |
| </p> |
| </div> |
| |
| |
| |
| Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a |
| |
| <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> |
| |
| provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| |
| </footer> |
| </div> |
| </div> |
| |
| </section> |
| |
| </div> |
| |
| |
| <script type="text/javascript"> |
| jQuery(function () { |
| SphinxRtdTheme.Navigation.enable(true); |
| }); |
| </script> |
| |
| |
| |
| |
| |
| |
| |
| <script type="text/javascript" src="/docs/_static/versionwarning.js"></script></body> |
| </html> |