<!DOCTYPE html>
<html lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<!-- Single viewport declaration. The page previously carried three competing
     ones; the 0.9 initial scale is kept, but maximum-scale is dropped so that
     users can still zoom the page. -->
<meta name="viewport" content="width=device-width, initial-scale=0.9" />
<title>Working with Schema &#8212; Apache Arrow Python Cookbook documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=4f649999" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=39aeeac0" />
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="icon" href="_static/favicon.ico"/>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Data Manipulation" href="data.html" />
<link rel="prev" title="Creating Arrow Objects" href="create.html" />
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
<!-- Matomo -->
<script>
  /* Command queue shared through window._paq. */
  window._paq = window._paq || [];
  var _paq = window._paq;

  /* Tracker methods like "setCustomDimension" should be called before "trackPageView". */
  /* Cookie tracking is explicitly disabled to avoid privacy issues. */
  [
    ['disableCookies'],
    ['trackPageView'],
    ['enableLinkTracking']
  ].forEach(function (command) {
    _paq.push(command);
  });

  (function () {
    var baseUrl = "https://analytics.apache.org/";
    _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
    _paq.push(['setSiteId', '20']);

    /* Insert the async loader script ahead of the first <script> in the document. */
    var firstScript = document.getElementsByTagName('script')[0];
    var loader = document.createElement('script');
    loader.async = true;
    loader.src = baseUrl + 'matomo.js';
    firstScript.parentNode.insertBefore(loader, firstScript);
  })();
</script>
<!-- End Matomo Code -->
</head><body>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<section id="working-with-schema">
<h1><a class="toc-backref" href="#id1" role="doc-backlink">Working with Schema</a><a class="headerlink" href="#working-with-schema" title="Link to this heading"></a></h1>
<p>Arrow automatically infers the most appropriate data type when reading in data
or converting Python objects to Arrow objects.</p>
<p>However, you might want to manually tell Arrow which data types to
use, for example, to ensure interoperability with databases and data warehouse
systems. This chapter includes recipes for dealing with schemas.</p>
<nav class="contents" id="contents">
<p class="topic-title">Contents</p>
<ul class="simple">
<li><p><a class="reference internal" href="#working-with-schema" id="id1">Working with Schema</a></p>
<ul>
<li><p><a class="reference internal" href="#setting-the-data-type-of-an-arrow-array" id="id2">Setting the data type of an Arrow Array</a></p></li>
<li><p><a class="reference internal" href="#setting-the-schema-of-a-table" id="id3">Setting the schema of a Table</a></p></li>
<li><p><a class="reference internal" href="#merging-multiple-schemas" id="id4">Merging multiple schemas</a></p></li>
</ul>
</li>
</ul>
</nav>
<section id="setting-the-data-type-of-an-arrow-array">
<h2><a class="toc-backref" href="#id2" role="doc-backlink">Setting the data type of an Arrow Array</a><a class="headerlink" href="#setting-the-data-type-of-an-arrow-array" title="Link to this heading"></a></h2>
<p>If you have an existing array and want to change its data type,
that can be done through the <code class="docutils literal notranslate"><span class="pre">cast</span></code> function:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">arr</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">])</span>
<span class="nb">print</span><span class="p">(</span><span class="n">arr</span><span class="o">.</span><span class="n">type</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>int64
</pre></div>
</div>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">arr</span> <span class="o">=</span> <span class="n">arr</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">int8</span><span class="p">())</span>
<span class="nb">print</span><span class="p">(</span><span class="n">arr</span><span class="o">.</span><span class="n">type</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>int8
</pre></div>
</div>
<p>You can also create an array of the requested type by providing
the type when the array is created:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">arr</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="nb">type</span><span class="o">=</span><span class="n">pa</span><span class="o">.</span><span class="n">int8</span><span class="p">())</span>
<span class="nb">print</span><span class="p">(</span><span class="n">arr</span><span class="o">.</span><span class="n">type</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>int8
</pre></div>
</div>
</section>
<section id="setting-the-schema-of-a-table">
<h2><a class="toc-backref" href="#id3" role="doc-backlink">Setting the schema of a Table</a><a class="headerlink" href="#setting-the-schema-of-a-table" title="Link to this heading"></a></h2>
<p>Tables contain multiple columns, each with its own name
and type. The combination of column names and types is what defines a schema.</p>
<p>A schema in Arrow can be defined using <code class="xref py py-meth docutils literal notranslate"><span class="pre">pyarrow.schema()</span></code>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;col1&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int8</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;col2&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;col3&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">())</span>
<span class="p">])</span>
</pre></div>
</div>
<p>The schema can then be provided to a table when created:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">([</span>
<span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span>
<span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;d&quot;</span><span class="p">,</span> <span class="s2">&quot;e&quot;</span><span class="p">],</span>
<span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]</span>
<span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">table</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>pyarrow.Table
col1: int8
col2: string
col3: double
----
col1: [[1,2,3,4,5]]
col2: [[&quot;a&quot;,&quot;b&quot;,&quot;c&quot;,&quot;d&quot;,&quot;e&quot;]]
col3: [[1,2,3,4,5]]
</pre></div>
</div>
<p>As with arrays, it’s possible to cast a table to a different schema
as long as the two schemas are compatible:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">schema_int32</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;col1&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int32</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;col2&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;col3&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">())</span>
<span class="p">])</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">schema_int32</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">table</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>pyarrow.Table
col1: int32
col2: string
col3: double
----
col1: [[1,2,3,4,5]]
col2: [[&quot;a&quot;,&quot;b&quot;,&quot;c&quot;,&quot;d&quot;,&quot;e&quot;]]
col3: [[1,2,3,4,5]]
</pre></div>
</div>
</section>
<section id="merging-multiple-schemas">
<h2><a class="toc-backref" href="#id4" role="doc-backlink">Merging multiple schemas</a><a class="headerlink" href="#merging-multiple-schemas" title="Link to this heading"></a></h2>
<p>When you have multiple separate groups of data that you want to combine,
it might be necessary to unify their schemas to create a superset of them
that applies to all data sources.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">first_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;country&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;population&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int32</span><span class="p">())</span>
<span class="p">])</span>
<span class="n">second_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;country_code&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;language&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">())</span>
<span class="p">])</span>
</pre></div>
</div>
<p><code class="xref py py-func docutils literal notranslate"><span class="pre">unify_schemas()</span></code> can be used to combine multiple schemas into
a single one:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">union_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">unify_schemas</span><span class="p">([</span><span class="n">first_schema</span><span class="p">,</span> <span class="n">second_schema</span><span class="p">])</span>
<span class="nb">print</span><span class="p">(</span><span class="n">union_schema</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>country: string
population: int32
country_code: string
language: string
</pre></div>
</div>
<p>If the combined schemas have overlapping columns, they can still be combined
as long as the colliding columns retain the same type (<code class="docutils literal notranslate"><span class="pre">country_code</span></code>):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">third_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;country_code&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;lat&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float32</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float32</span><span class="p">()),</span>
<span class="p">])</span>
<span class="n">union_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">unify_schemas</span><span class="p">([</span><span class="n">first_schema</span><span class="p">,</span> <span class="n">second_schema</span><span class="p">,</span> <span class="n">third_schema</span><span class="p">])</span>
<span class="nb">print</span><span class="p">(</span><span class="n">union_schema</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>country: string
population: int32
country_code: string
language: string
lat: float
long: float
</pre></div>
</div>
<p>If instead a merged field has diverging types in the combined schemas,
then trying to merge the schemas will fail. For example, if <code class="docutils literal notranslate"><span class="pre">country_code</span></code>
were numeric instead of a string, we would be unable to unify the schemas
because in <code class="docutils literal notranslate"><span class="pre">second_schema</span></code> it was already declared as a <code class="docutils literal notranslate"><span class="pre">pa.string()</span></code>:</p>
<!-- Python sample: whitespace inside <pre> is rendered verbatim, so the bodies
     of the try/except clauses must stay indented for the snippet to be valid. -->
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">third_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([</span>
<span class="p">(</span><span class="s2">&quot;country_code&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int32</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;lat&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float32</span><span class="p">()),</span>
<span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float32</span><span class="p">()),</span>
<span class="p">])</span>
<span class="k">try</span><span class="p">:</span>
    <span class="n">union_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">unify_schemas</span><span class="p">([</span><span class="n">first_schema</span><span class="p">,</span> <span class="n">second_schema</span><span class="p">,</span> <span class="n">third_schema</span><span class="p">])</span>
<span class="k">except</span> <span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">ArrowInvalid</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">ArrowTypeError</span><span class="p">)</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">e</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>Unable to merge: Field country_code has incompatible types: string vs int32
</pre></div>
</div>
</section>
</section>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<p class="logo">
<!-- The image is the only content of the link, so its alt text names the link destination. -->
<a href="index.html">
<img class="logo" src="_static/arrow-logo_vertical_black-txt_transparent-bg.svg" alt="Apache Arrow Python Cookbook home"/>
</a>
</p>
<p>
<!-- Third-party GitHub button (ghbtns.com) for the apache/arrow-cookbook repository.
     The title gives the frame an accessible name, ampersands in the URL are escaped,
     and width/height are plain pixel integers as the attributes require. -->
<iframe src="https://ghbtns.com/github-btn.html?user=apache&amp;repo=arrow-cookbook&amp;type=none&amp;count=true&amp;size=large&amp;v=2"
title="GitHub button for apache/arrow-cookbook"
allowtransparency="true" frameborder="0" scrolling="0" width="200" height="35"></iframe>
</p>
<h3>Navigation</h3>
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="io.html">Reading and Writing Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="create.html">Creating Arrow Objects</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Working with Schema</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#setting-the-data-type-of-an-arrow-array">Setting the data type of an Arrow Array</a></li>
<li class="toctree-l2"><a class="reference internal" href="#setting-the-schema-of-a-table">Setting the schema of a Table</a></li>
<li class="toctree-l2"><a class="reference internal" href="#merging-multiple-schemas">Merging multiple schemas</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="data.html">Data Manipulation</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight.html">Arrow Flight</a></li>
</ul>
<hr />
<ul>
<li class="toctree-l1"><a href="https://arrow.apache.org/docs/python/index.html">User Guide</a></li>
<li class="toctree-l1"><a href="https://arrow.apache.org/docs/python/api.html">API Reference</a></li>
</ul>
<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="index.html">Documentation overview</a><ul>
<li>Previous: <a href="create.html" title="previous chapter">Creating Arrow Objects</a></li>
<li>Next: <a href="data.html" title="next chapter">Data Manipulation</a></li>
</ul></li>
</ul>
</div>
<div id="searchbox" style="display: none" role="search">
<h3 id="searchlabel">Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="search.html" method="get">
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
<input type="submit" value="Go" />
</form>
</div>
</div>
<script>document.getElementById('searchbox').style.display = "block"</script>
</div>
</div>
<div class="clearer"></div>
</div>
<!-- Page footer: copyright, generator credits (linked over HTTPS) and a link to the reST source. -->
<div class="footer">
&copy;2022, Apache Software Foundation.
|
Powered by <a href="https://www.sphinx-doc.org/">Sphinx 7.2.6</a>
&amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.13</a>
|
<a href="_sources/schema.rst.txt"
rel="nofollow">Page source</a>
</div>
</body>
</html>