<!DOCTYPE html>
<html lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Creating Arrow Objects &#8212; Apache Arrow Python Cookbook documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=49eeb2a1" />
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="icon" href="_static/favicon.ico"/>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Working with Schema" href="schema.html" />
<link rel="prev" title="Reading and Writing Data" href="io.html" />
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
<!-- Matomo -->
<script>
// Matomo command queue: reuse the one already on the page, or start a new one.
window._paq = window._paq || [];
var _paq = window._paq;
// Any tracker configuration (e.g. "setCustomDimension") belongs before "trackPageView".
// Cookies are switched off explicitly so that tracking raises no privacy concerns.
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function () {
  var baseUrl = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
  _paq.push(['setSiteId', '20']);

  // Insert the asynchronously loaded tracker script ahead of the first <script> on the page.
  var firstScript = document.getElementsByTagName('script')[0];
  var loader = document.createElement('script');
  loader.async = true;
  loader.src = baseUrl + 'matomo.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script>
<!-- End Matomo Code -->
</head><body>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<section id="creating-arrow-objects">
<h1><a class="toc-backref" href="#id1" role="doc-backlink">Creating Arrow Objects</a><a class="headerlink" href="#creating-arrow-objects" title="Link to this heading"></a></h1>
<p>Recipes related to the creation of Arrays, Tables,
Tensors and all other Arrow entities.</p>
<nav class="contents" id="contents">
<p class="topic-title">Contents</p>
<ul class="simple">
<li><p><a class="reference internal" href="#creating-arrow-objects" id="id1">Creating Arrow Objects</a></p>
<ul>
<li><p><a class="reference internal" href="#creating-arrays" id="id2">Creating Arrays</a></p></li>
<li><p><a class="reference internal" href="#creating-tables" id="id3">Creating Tables</a></p></li>
<li><p><a class="reference internal" href="#create-table-from-plain-types" id="id4">Create Table from Plain Types</a></p></li>
<li><p><a class="reference internal" href="#creating-record-batches" id="id5">Creating Record Batches</a></p></li>
<li><p><a class="reference internal" href="#store-categorical-data" id="id6">Store Categorical Data</a></p></li>
</ul>
</li>
</ul>
</nav>
<section id="creating-arrays">
<h2><a class="toc-backref" href="#id2" role="doc-backlink">Creating Arrays</a><a class="headerlink" href="#creating-arrays" title="Link to this heading"></a></h2>
<p>Arrow keeps data in contiguous arrays optimised for memory footprint
and SIMD analyses. In Python it’s possible to build <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.Array</span></code></a>
starting from Python <code class="docutils literal notranslate"><span class="pre">lists</span></code> (or sequence types in general),
<code class="docutils literal notranslate"><span class="pre">numpy</span></code> arrays and <code class="docutils literal notranslate"><span class="pre">pandas</span></code> Series.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">array</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">])</span>
</pre></div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">array</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>[
1,
2,
3,
4,
5
]
</pre></div>
</div>
<p>Arrays can also be created with a <code class="docutils literal notranslate"><span class="pre">mask</span></code> that specifies which values should
be considered null:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="n">array</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span>
<span class="n">mask</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="kc">True</span><span class="p">,</span> <span class="kc">False</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="kc">False</span><span class="p">,</span> <span class="kc">True</span><span class="p">]))</span>
<span class="nb">print</span><span class="p">(</span><span class="n">array</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>[
null,
2,
null,
4,
null
]
</pre></div>
</div>
<p>When building arrays from <code class="docutils literal notranslate"><span class="pre">numpy</span></code> or <code class="docutils literal notranslate"><span class="pre">pandas</span></code>, Arrow will leverage
optimized code paths that rely on the internal in-memory representation
of the data used by <code class="docutils literal notranslate"><span class="pre">numpy</span></code> and <code class="docutils literal notranslate"><span class="pre">pandas</span></code>.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="n">array_from_numpy</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">5</span><span class="p">))</span>
<span class="n">array_from_pandas</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">]))</span>
</pre></div>
</div>
</section>
<section id="creating-tables">
<h2><a class="toc-backref" href="#id3" role="doc-backlink">Creating Tables</a><a class="headerlink" href="#creating-tables" title="Link to this heading"></a></h2>
<p>Arrow supports tabular data in <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.Table</span></code></a>: each column
is represented by a <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.ChunkedArray.html#pyarrow.ChunkedArray" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.ChunkedArray</span></code></a> and tables can be created
by pairing multiple arrays with names for their columns</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">([</span>
<span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">]),</span>
<span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;d&quot;</span><span class="p">,</span> <span class="s2">&quot;e&quot;</span><span class="p">]),</span>
<span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">])</span>
<span class="p">],</span> <span class="n">names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;col1&quot;</span><span class="p">,</span> <span class="s2">&quot;col2&quot;</span><span class="p">,</span> <span class="s2">&quot;col3&quot;</span><span class="p">])</span>
<span class="nb">print</span><span class="p">(</span><span class="n">table</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>pyarrow.Table
col1: int64
col2: string
col3: double
----
col1: [[1,2,3,4,5]]
col2: [[&quot;a&quot;,&quot;b&quot;,&quot;c&quot;,&quot;d&quot;,&quot;e&quot;]]
col3: [[1,2,3,4,5]]
</pre></div>
</div>
</section>
<section id="create-table-from-plain-types">
<h2><a class="toc-backref" href="#id4" role="doc-backlink">Create Table from Plain Types</a><a class="headerlink" href="#create-table-from-plain-types" title="Link to this heading"></a></h2>
<p>Arrow allows fast zero-copy creation of Arrow arrays
from numpy arrays and pandas Series, but it’s also
possible to create Arrow Arrays and Tables from
plain Python structures.</p>
<p>The <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.table.html#pyarrow.table" title="(in Apache Arrow v15.0.1)"><code class="xref py py-func docutils literal notranslate"><span class="pre">pyarrow.table()</span></code></a> function allows creation of Tables
from a variety of inputs, including plain Python objects:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span>
<span class="s2">&quot;col1&quot;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span>
<span class="s2">&quot;col2&quot;</span><span class="p">:</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;d&quot;</span><span class="p">,</span> <span class="s2">&quot;e&quot;</span><span class="p">]</span>
<span class="p">})</span>
<span class="nb">print</span><span class="p">(</span><span class="n">table</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>pyarrow.Table
col1: int64
col2: string
----
col1: [[1,2,3,4,5]]
col2: [[&quot;a&quot;,&quot;b&quot;,&quot;c&quot;,&quot;d&quot;,&quot;e&quot;]]
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>All values provided in the dictionary will be passed to
<a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.array.html#pyarrow.array" title="(in Apache Arrow v15.0.1)"><code class="xref py py-func docutils literal notranslate"><span class="pre">pyarrow.array()</span></code></a> for conversion to Arrow arrays,
and will benefit from zero copy behaviour when possible.</p>
</div>
<p>The <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pylist" title="(in Apache Arrow v15.0.1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.from_pylist()</span></code></a> method allows the creation
of Tables from Python lists of row dicts. Types are inferred if a
schema is not explicitly passed.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="o">.</span><span class="n">from_pylist</span><span class="p">([</span>
<span class="p">{</span><span class="s2">&quot;col1&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">&quot;col2&quot;</span><span class="p">:</span> <span class="s2">&quot;a&quot;</span><span class="p">},</span>
<span class="p">{</span><span class="s2">&quot;col1&quot;</span><span class="p">:</span> <span class="mi">2</span><span class="p">,</span> <span class="s2">&quot;col2&quot;</span><span class="p">:</span> <span class="s2">&quot;b&quot;</span><span class="p">},</span>
<span class="p">{</span><span class="s2">&quot;col1&quot;</span><span class="p">:</span> <span class="mi">3</span><span class="p">,</span> <span class="s2">&quot;col2&quot;</span><span class="p">:</span> <span class="s2">&quot;c&quot;</span><span class="p">},</span>
<span class="p">{</span><span class="s2">&quot;col1&quot;</span><span class="p">:</span> <span class="mi">4</span><span class="p">,</span> <span class="s2">&quot;col2&quot;</span><span class="p">:</span> <span class="s2">&quot;d&quot;</span><span class="p">},</span>
<span class="p">{</span><span class="s2">&quot;col1&quot;</span><span class="p">:</span> <span class="mi">5</span><span class="p">,</span> <span class="s2">&quot;col2&quot;</span><span class="p">:</span> <span class="s2">&quot;e&quot;</span><span class="p">}</span>
<span class="p">])</span>
<span class="nb">print</span><span class="p">(</span><span class="n">table</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>pyarrow.Table
col1: int64
col2: string
----
col1: [[1,2,3,4,5]]
col2: [[&quot;a&quot;,&quot;b&quot;,&quot;c&quot;,&quot;d&quot;,&quot;e&quot;]]
</pre></div>
</div>
</section>
<section id="creating-record-batches">
<h2><a class="toc-backref" href="#id5" role="doc-backlink">Creating Record Batches</a><a class="headerlink" href="#creating-record-batches" title="Link to this heading"></a></h2>
<p>Most I/O operations in Arrow happen when shipping batches of data
to their destination. <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html#pyarrow.RecordBatch" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.RecordBatch</span></code></a> is the way
Arrow represents batches of data. A RecordBatch can be seen as a slice
of a table.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">batch</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">RecordBatch</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">([</span>
<span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">9</span><span class="p">]),</span>
<span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">10</span><span class="p">])</span>
<span class="p">],</span> <span class="n">names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;odd&quot;</span><span class="p">,</span> <span class="s2">&quot;even&quot;</span><span class="p">])</span>
</pre></div>
</div>
<p>Multiple batches can be combined into a table using
<a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_batches" title="(in Apache Arrow v15.0.1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.from_batches()</span></code></a></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">second_batch</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">RecordBatch</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">([</span>
<span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">11</span><span class="p">,</span> <span class="mi">13</span><span class="p">,</span> <span class="mi">15</span><span class="p">,</span> <span class="mi">17</span><span class="p">,</span> <span class="mi">19</span><span class="p">]),</span>
<span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">12</span><span class="p">,</span> <span class="mi">14</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mi">20</span><span class="p">])</span>
<span class="p">],</span> <span class="n">names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;odd&quot;</span><span class="p">,</span> <span class="s2">&quot;even&quot;</span><span class="p">])</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="o">.</span><span class="n">from_batches</span><span class="p">([</span><span class="n">batch</span><span class="p">,</span> <span class="n">second_batch</span><span class="p">])</span>
</pre></div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">table</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>pyarrow.Table
odd: int64
even: int64
----
odd: [[1,3,5,7,9],[11,13,15,17,19]]
even: [[2,4,6,8,10],[12,14,16,18,20]]
</pre></div>
</div>
<p>Equally, <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.Table</span></code></a> can be converted to a list of
<a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html#pyarrow.RecordBatch" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.RecordBatch</span></code></a> using the <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_batches" title="(in Apache Arrow v15.0.1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Table.to_batches()</span></code></a>
method</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">record_batches</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">to_batches</span><span class="p">(</span><span class="n">max_chunksize</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">record_batches</span><span class="p">))</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>2
</pre></div>
</div>
</section>
<section id="store-categorical-data">
<h2><a class="toc-backref" href="#id6" role="doc-backlink">Store Categorical Data</a><a class="headerlink" href="#store-categorical-data" title="Link to this heading"></a></h2>
<p>Arrow provides the <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html#pyarrow.DictionaryArray" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.DictionaryArray</span></code></a> type
to represent categorical data without the cost of
storing and repeating the categories over and over. This can reduce memory use
when columns might have large values (such as text).</p>
<p>If you have an array containing repeated categorical data,
it is possible to convert it to a <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html#pyarrow.DictionaryArray" title="(in Apache Arrow v15.0.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.DictionaryArray</span></code></a>
using <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.dictionary_encode" title="(in Apache Arrow v15.0.1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.Array.dictionary_encode()</span></code></a></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">arr</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="s2">&quot;red&quot;</span><span class="p">,</span> <span class="s2">&quot;green&quot;</span><span class="p">,</span> <span class="s2">&quot;blue&quot;</span><span class="p">,</span> <span class="s2">&quot;blue&quot;</span><span class="p">,</span> <span class="s2">&quot;green&quot;</span><span class="p">,</span> <span class="s2">&quot;red&quot;</span><span class="p">])</span>
<span class="n">categorical</span> <span class="o">=</span> <span class="n">arr</span><span class="o">.</span><span class="n">dictionary_encode</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="n">categorical</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>...
-- dictionary:
[
&quot;red&quot;,
&quot;green&quot;,
&quot;blue&quot;
]
-- indices:
[
0,
1,
2,
2,
1,
0
]
</pre></div>
</div>
<p>If you already know the categories and indices then you can skip the encode
step and directly create the <code class="docutils literal notranslate"><span class="pre">DictionaryArray</span></code> using
<a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html#pyarrow.DictionaryArray.from_arrays" title="(in Apache Arrow v15.0.1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.DictionaryArray.from_arrays()</span></code></a></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">categorical</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">DictionaryArray</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">(</span>
<span class="n">indices</span><span class="o">=</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span>
<span class="n">dictionary</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;red&quot;</span><span class="p">,</span> <span class="s2">&quot;green&quot;</span><span class="p">,</span> <span class="s2">&quot;blue&quot;</span><span class="p">]</span>
<span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">categorical</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>...
-- dictionary:
[
&quot;red&quot;,
&quot;green&quot;,
&quot;blue&quot;
]
-- indices:
[
0,
1,
2,
2,
1,
0
]
</pre></div>
</div>
</section>
</section>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<p class="logo">
<a href="index.html">
<img class="logo" src="_static/arrow-logo_vertical_black-txt_transparent-bg.svg" alt="Apache Arrow Python Cookbook home" />
</a>
</p>
<p>
<iframe src="https://ghbtns.com/github-btn.html?user=apache&amp;repo=arrow-cookbook&amp;type=none&amp;count=true&amp;size=large&amp;v=2"
title="GitHub button for apache/arrow-cookbook" allowtransparency="true" frameborder="0" scrolling="0" width="200" height="35"></iframe>
</p>
<h3>Navigation</h3>
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="io.html">Reading and Writing Data</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Creating Arrow Objects</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#creating-arrays">Creating Arrays</a></li>
<li class="toctree-l2"><a class="reference internal" href="#creating-tables">Creating Tables</a></li>
<li class="toctree-l2"><a class="reference internal" href="#create-table-from-plain-types">Create Table from Plain Types</a></li>
<li class="toctree-l2"><a class="reference internal" href="#creating-record-batches">Creating Record Batches</a></li>
<li class="toctree-l2"><a class="reference internal" href="#store-categorical-data">Store Categorical Data</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="schema.html">Working with Schema</a></li>
<li class="toctree-l1"><a class="reference internal" href="data.html">Data Manipulation</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight.html">Arrow Flight</a></li>
</ul>
<hr />
<ul>
<li class="toctree-l1"><a href="https://arrow.apache.org/docs/python/index.html">User Guide</a></li>
<li class="toctree-l1"><a href="https://arrow.apache.org/docs/python/api.html">API Reference</a></li>
</ul>
<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="index.html">Documentation overview</a><ul>
<li>Previous: <a href="io.html" title="previous chapter">Reading and Writing Data</a></li>
<li>Next: <a href="schema.html" title="next chapter">Working with Schema</a></li>
</ul></li>
</ul>
</div>
<div id="searchbox" style="display: none" role="search">
<h3 id="searchlabel">Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="search.html" method="get">
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
<input type="submit" value="Go" />
</form>
</div>
</div>
<!-- The #searchbox container is rendered with an inline "display: none"; this script switches it to "block", so the search form is only shown when JavaScript runs. -->
<script>document.getElementById('searchbox').style.display = "block"</script>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="footer">
&#169;2022, Apache Software Foundation.
|
Powered by <a href="https://www.sphinx-doc.org/">Sphinx 7.2.6</a>
&amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
|
<a href="_sources/create.rst.txt"
rel="nofollow">Page source</a>
</div>
</body>
</html>