blob: d4377b550536a7eabb7bf85fe538df7a38f684e4 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>pyspark.pandas.namespace &#8212; PySpark 4.0.0-preview1 documentation</title>
<!-- Copy the saved mode/theme from localStorage onto <html> before the stylesheets below are requested. -->
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<!-- These scripts are intentionally not deferred: inline scripts in the body call jQuery ($) while the page is being parsed. -->
<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/pandas/namespace';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/namespace.html" />
<link rel="search" title="Search" href="../../../search.html" />
<!-- NOTE(review): content="None" looks like an unset language value leaking through the page template; confirm and set it at the generator. -->
<meta name="docsearch:language" content="None">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<!-- Skip link to #main-content, then two checkbox inputs (class "sidebar-toggle"), each followed by an empty overlay label bound to it via for/id. -->
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox" class="sidebar-toggle" name="__primary" id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox" class="sidebar-toggle" name="__secondary" id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<!-- Search overlay: a GET form that submits the "q" field to search.html. -->
<div class="search-button__wrapper">
  <div class="search-button__overlay"></div>
  <div class="search-button__search-container">
    <form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
      <i class="fa-solid fa-magnifying-glass"></i>
      <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
      <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
    </form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<!-- Brand link to the docs home page. The alt text names the link destination, because the image is the link's only content. -->
<a class="navbar-brand logo" href="../../../index.html">
<img src="../../../_static/spark-logo-light.png" class="logo__image only-light" alt="PySpark 4.0.0-preview1 documentation - Home"/>
<!-- The dark-theme logo is written by script, so it is only present when scripts run. -->
<script>document.write(`<img src="../../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="PySpark 4.0.0-preview1 documentation - Home"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<!-- Header site navigation: a paragraph exposed as a level-1 heading, followed by one list item per top-level documentation section. -->
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
// The search button is written into the page by script, so it only exists
// in the DOM when scripts run. It is an icon-only button named via aria-label.
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Version switcher: the button shows the version of these docs; the empty #version_switcher menu is filled by the script that follows. -->
<!-- NOTE(review): this markup, including its ids, appears twice on this page (header and sidebar); ids are expected to be unique per document. -->
<!-- NOTE(review): other controls on this page use data-bs-* attributes; confirm that data-toggle="dropdown" is honoured by the loaded Bootstrap build. -->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Build the docs-homepage URL for one versions.json entry by substituting the
// entry's `version` field into the URL template.
function buildURL(entry) {
  const homepageTemplate = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return homepageTemplate.replace("{version}", entry.version);
}
// Click handler for a version-switcher link.
// Probes (HTTP HEAD) whether the page currently being viewed also exists in
// the docs version the link points to. If it does, navigate to that page;
// otherwise navigate to the link's own href (that version's docs homepage).
// Always returns false so that, when installed via `onclick`, the browser's
// default navigation is cancelled and the redirect happens from the callbacks.
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/pandas/namespace.html";
  const otherDocsHomepage = event.target.getAttribute("href");
  // The link href names a file (".../api/python/index.html"). Drop everything
  // after the last "/" so the page path is appended to the docs root rather
  // than producing ".../index.html_modules/...", which can never exist.
  const otherDocsRoot = (otherDocsHomepage || "").replace(/[^/]*$/, "");
  const tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // page is missing in that version (or the probe failed): use its homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Populate the version switcher dropdown that sits next to this script.
(function () {
  // The switcher markup (with the same ids) appears more than once on this
  // page, and $("#version_switcher") only matches the first element with that
  // id. Resolve the menu inside the navbar item that contains the running
  // script instead; fall back to the id lookup if it cannot be found.
  var runningScript = document.currentScript;
  var $menu = runningScript
    ? $(runningScript).closest(".navbar-item").find(".dropdown-menu").first()
    : $();
  if (!$menu.length) {
    $menu = $("#version_switcher");
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data) {
    // create the nodes in JSON order; each link initially points at the
    // homepage of that docs version
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      // Installed via the onclick property (not addEventListener) because the
      // handler relies on `return false` to cancel the default navigation.
      node.onclick = checkPageExistsAndRedirect;
      $menu.append(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// The theme switch button is written by script, so it only exists when
// scripts run. It contains one icon span per data-mode value: light, dark, auto.
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<!-- External project links. Each icon is decorative (aria-hidden); the link's accessible text comes from the visually hidden span. A <span> is used because <label> is interactive content, which is not allowed inside <a>, and there is no form control for it to label. -->
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box" aria-hidden="true"></i></span>
<span class="sr-only">PyPI</span></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
// Second copy of the search button, placed in the "navbar-persistent--mobile"
// container. It is written by script, so it only exists when scripts run.
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar hide-on-wide">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<!-- Sidebar site navigation: a paragraph exposed as a level-1 heading, followed by one list item per top-level documentation section. -->
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Version switcher (sidebar placement): the button shows the version of these docs; the menu below starts empty. -->
<!-- NOTE(review): the ids used here also appear earlier on this page, so an id lookup for "version_switcher" resolves to the earlier element, not this one. -->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Build the docs-homepage URL for one versions.json entry by substituting the
// entry's `version` field into the URL template.
function buildURL(entry) {
  const homepageTemplate = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return homepageTemplate.replace("{version}", entry.version);
}
// Click handler for a version-switcher link.
// Probes (HTTP HEAD) whether the page currently being viewed also exists in
// the docs version the link points to. If it does, navigate to that page;
// otherwise navigate to the link's own href (that version's docs homepage).
// Always returns false so that, when installed via `onclick`, the browser's
// default navigation is cancelled and the redirect happens from the callbacks.
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/pandas/namespace.html";
  const otherDocsHomepage = event.target.getAttribute("href");
  // The link href names a file (".../api/python/index.html"). Drop everything
  // after the last "/" so the page path is appended to the docs root rather
  // than producing ".../index.html_modules/...", which can never exist.
  const otherDocsRoot = (otherDocsHomepage || "").replace(/[^/]*$/, "");
  const tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // page is missing in that version (or the probe failed): use its homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Populate the version switcher dropdown that sits next to this script.
(function () {
  // The switcher markup (with the same ids) appears more than once on this
  // page, and $("#version_switcher") only matches the first element with that
  // id, which would leave this copy's menu empty. Resolve the menu inside the
  // navbar item that contains the running script instead; fall back to the id
  // lookup if it cannot be found.
  var runningScript = document.currentScript;
  var $menu = runningScript
    ? $(runningScript).closest(".navbar-item").find(".dropdown-menu").first()
    : $();
  if (!$menu.length) {
    $menu = $("#version_switcher");
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data) {
    // create the nodes in JSON order; each link initially points at the
    // homepage of that docs version
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      // Installed via the onclick property (not addEventListener) because the
      // handler relies on `return false` to cancel the default navigation.
      node.onclick = checkPageExistsAndRedirect;
      $menu.append(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// The theme switch button is written by script, so it only exists when
// scripts run. It contains one icon span per data-mode value: light, dark, auto.
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<!-- External project links. Each icon is decorative (aria-hidden); the link's accessible text comes from the visually hidden span. A <span> is used because <label> is interactive content, which is not allowed inside <a>, and there is no form control for it to label. -->
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box" aria-hidden="true"></i></span>
<span class="sr-only">PyPI</span></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<!-- Breadcrumb trail. The <nav> is the single navigation landmark; the list keeps its native list semantics so the items are announced as a list. -->
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../../../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home" aria-hidden="true"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li>
<li class="breadcrumb-item active" aria-current="page">pyspark.pandas.namespace</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<h1>Source code for pyspark.pandas.namespace</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">Wrappers around spark that correspond to common pandas functions.</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">Callable</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">Set</span><span class="p">,</span>
<span class="n">Sized</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">Type</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">cast</span><span class="p">,</span>
<span class="n">no_type_check</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Iterable</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">tzinfo</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span>
<span class="kn">from</span> <span class="nn">io</span> <span class="kn">import</span> <span class="n">BytesIO</span>
<span class="kn">import</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="n">is_datetime64_dtype</span><span class="p">,</span>
<span class="n">is_list_like</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pandas.tseries.offsets</span> <span class="kn">import</span> <span class="n">DateOffset</span>
<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="kn">import</span> <span class="nn">pyarrow.parquet</span> <span class="k">as</span> <span class="nn">pq</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span><span class="p">,</span> <span class="n">Column</span> <span class="k">as</span> <span class="n">PySparkColumn</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">ByteType</span><span class="p">,</span>
<span class="n">ShortType</span><span class="p">,</span>
<span class="n">IntegerType</span><span class="p">,</span>
<span class="n">LongType</span><span class="p">,</span>
<span class="n">FloatType</span><span class="p">,</span>
<span class="n">DoubleType</span><span class="p">,</span>
<span class="n">BooleanType</span><span class="p">,</span>
<span class="n">TimestampType</span><span class="p">,</span>
<span class="n">TimestampNTZType</span><span class="p">,</span>
<span class="n">DecimalType</span><span class="p">,</span>
<span class="n">StringType</span><span class="p">,</span>
<span class="n">DateType</span><span class="p">,</span>
<span class="n">StructType</span><span class="p">,</span>
<span class="n">DataType</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">PySparkDataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.base</span> <span class="kn">import</span> <span class="n">IndexOpsMixin</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">align_diff_frames</span><span class="p">,</span>
<span class="n">default_session</span><span class="p">,</span>
<span class="n">is_name_like_tuple</span><span class="p">,</span>
<span class="n">is_name_like_value</span><span class="p">,</span>
<span class="n">name_like_string</span><span class="p">,</span>
<span class="n">same_anchor</span><span class="p">,</span>
<span class="n">scol_for</span><span class="p">,</span>
<span class="n">validate_axis</span><span class="p">,</span>
<span class="n">log_advice</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">_reduce_spark_multi</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">,</span>
<span class="n">DEFAULT_SERIES_NAME</span><span class="p">,</span>
<span class="n">HIDDEN_COLUMNS</span><span class="p">,</span>
<span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark.utils</span> <span class="kn">import</span> <span class="n">as_nullable_spark_type</span><span class="p">,</span> <span class="n">force_decimal_precision_scale</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">Index</span><span class="p">,</span> <span class="n">DatetimeIndex</span><span class="p">,</span> <span class="n">TimedeltaIndex</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.multi</span> <span class="kn">import</span> <span class="n">MultiIndex</span>
<span class="c1"># For Supporting Spark Connect</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">get_column_class</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span>
<span class="s2">&quot;from_pandas&quot;</span><span class="p">,</span>
<span class="s2">&quot;range&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_csv&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_delta&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_table&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_spark_io&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_parquet&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_clipboard&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_excel&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_html&quot;</span><span class="p">,</span>
<span class="s2">&quot;to_datetime&quot;</span><span class="p">,</span>
<span class="s2">&quot;date_range&quot;</span><span class="p">,</span>
<span class="s2">&quot;to_timedelta&quot;</span><span class="p">,</span>
<span class="s2">&quot;timedelta_range&quot;</span><span class="p">,</span>
<span class="s2">&quot;get_dummies&quot;</span><span class="p">,</span>
<span class="s2">&quot;concat&quot;</span><span class="p">,</span>
<span class="s2">&quot;melt&quot;</span><span class="p">,</span>
<span class="s2">&quot;isna&quot;</span><span class="p">,</span>
<span class="s2">&quot;isnull&quot;</span><span class="p">,</span>
<span class="s2">&quot;notna&quot;</span><span class="p">,</span>
<span class="s2">&quot;notnull&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_sql_table&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_sql_query&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_sql&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_json&quot;</span><span class="p">,</span>
<span class="s2">&quot;merge&quot;</span><span class="p">,</span>
<span class="s2">&quot;merge_asof&quot;</span><span class="p">,</span>
<span class="s2">&quot;to_numeric&quot;</span><span class="p">,</span>
<span class="s2">&quot;broadcast&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_orc&quot;</span><span class="p">,</span>
<span class="p">]</span>
<span class="k">def</span> <span class="nf">from_pandas</span><span class="p">(</span><span class="n">pobj</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">Index</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Create a pandas-on-Spark DataFrame, Series or Index from a pandas DataFrame, Series or Index.</span>
<span class="sd"> This is similar to Spark&#39;s `SparkSession.createDataFrame()` with pandas DataFrame,</span>
<span class="sd"> but this also works with pandas Series and picks the index.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> pobj : pandas.DataFrame or pandas.Series</span>
<span class="sd"> pandas DataFrame or Series to read.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> If a pandas Series is passed in, this function returns a pandas-on-Spark Series.</span>
<span class="sd"> If a pandas DataFrame is passed in, this function returns a pandas-on-Spark DataFrame.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="n">Series</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">):</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">pobj</span><span class="p">))</span><span class="o">.</span><span class="n">index</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Unknown data type: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="c1"># built-in range</span>
<span class="n">_range</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="nb">range</span><span class="p">]</span> <span class="o">=</span> <span class="nb">range</span> <span class="c1"># type: ignore[assignment]</span>
<!-- Source listing for pyspark.pandas.range: the "[docs]" anchor links back to the
     function's API reference page, and the spans that follow are the syntax-highlighted
     Python text of the function (signature, docstring, two-statement body).
     NOTE(review): the span class names look like Pygments output, so this markup is
     presumably generated and should be rebuilt from the Python source rather than
     edited by hand; confirm. This comment is closed on the same line as the opening
     div so that no extra text node is introduced ahead of the listing. --><div class="viewcode-block" id="range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.range.html#pyspark.pandas.range">[docs]</a><span class="k">def</span> <span class="nf">range</span><span class="p">(</span>
<span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">step</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">num_partitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create a DataFrame with some range of numbers.</span>
<span class="sd"> The resulting DataFrame has a single int64 column named `id`, containing elements in a range</span>
<span class="sd"> from ``start`` to ``end`` (exclusive) with step value ``step``. If only the first parameter</span>
<span class="sd"> (i.e. start) is specified, we treat it as the end value with the start value being 0.</span>
<span class="sd"> This is like the range function in SparkSession and is used primarily for testing.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : int</span>
<span class="sd"> the start value (inclusive)</span>
<span class="sd"> end : int, optional</span>
<span class="sd"> the end value (exclusive)</span>
<span class="sd"> step : int, optional, default 1</span>
<span class="sd"> the incremental step</span>
<span class="sd"> num_partitions : int, optional</span>
<span class="sd"> the number of partitions of the DataFrame</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> When the first parameter is specified, we generate a range of values up till that number.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(5)</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> 4 4</span>
<span class="sd"> When start, end, and step are specified:</span>
<span class="sd"> &gt;&gt;&gt; ps.range(start = 100, end = 200, step = 20)</span>
<span class="sd"> id</span>
<span class="sd"> 0 100</span>
<span class="sd"> 1 120</span>
<span class="sd"> 2 140</span>
<span class="sd"> 3 160</span>
<span class="sd"> 4 180</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span> <span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span> <span class="n">step</span><span class="o">=</span><span class="n">step</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_csv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_csv.html#pyspark.pandas.read_csv">[docs]</a><span class="k">def</span> <span class="nf">read_csv</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span>
<span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;,&quot;</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;infer&quot;</span><span class="p">,</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">usecols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">nrows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">quotechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">escapechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">comment</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Read CSV (comma-separated) file into DataFrame or Series.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : str or list</span>
<span class="sd"> Path(s) of the CSV file(s) to be read.</span>
<span class="sd"> sep : str, default ‘,’</span>
<span class="sd"> Delimiter to use. Non empty string.</span>
<span class="sd"> header : int, str or None, default ‘infer’</span>
<span class="sd"> Whether to use the column names, and the start of the data.</span>
<span class="sd"> Default behavior is to infer the column names: if no names are passed</span>
<span class="sd"> the behavior is identical to `header=0` and column names are inferred from</span>
<span class="sd"> the first line of the file, if column names are passed explicitly then</span>
<span class="sd"> the behavior is identical to `header=None`. Explicitly pass `header=0` to be</span>
<span class="sd"> able to replace existing names.</span>
<span class="sd"> names : str or array-like, optional</span>
<span class="sd"> List of column names to use. If file contains no header row, then you should</span>
<span class="sd"> explicitly pass `header=None`. Duplicates in this list will cause an error to be issued.</span>
<span class="sd"> If a string is given, it should be a DDL-formatted string in Spark SQL, which is</span>
<span class="sd"> preferred to avoid schema inference for better performance.</span>
<span class="sd"> index_col: str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> usecols : list-like or callable, optional</span>
<span class="sd"> Return a subset of the columns. If list-like, all elements must either be</span>
<span class="sd"> positional (i.e. integer indices into the document columns) or strings that</span>
<span class="sd"> correspond to column names provided either by the user in names or inferred</span>
<span class="sd"> from the document header row(s).</span>
<span class="sd"> If callable, the callable function will be evaluated against the column names,</span>
<span class="sd"> returning names where the callable function evaluates to `True`.</span>
<span class="sd"> dtype : Type name or dict of column -&gt; type, default None</span>
<span class="sd"> Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} Use str or object</span>
<span class="sd"> together with suitable na_values settings to preserve and not interpret dtype.</span>
<span class="sd"> nrows : int, default None</span>
<span class="sd"> Number of rows to read from the CSV file.</span>
<span class="sd"> parse_dates : boolean or list of ints or names or list of lists or dict, default `False`.</span>
<span class="sd"> Currently only `False` is allowed.</span>
<span class="sd"> quotechar : str (length 1), optional</span>
<span class="sd"> The character used to denote the start and end of a quoted item. Quoted items can include</span>
<span class="sd"> the delimiter and it will be ignored.</span>
<span class="sd"> escapechar : str (length 1), default None</span>
<span class="sd"> One-character string used to escape other characters.</span>
<span class="sd"> comment: str, optional</span>
<span class="sd"> Indicates the line should not be parsed.</span>
<span class="sd"> encoding: str, optional</span>
<span class="sd"> Indicates the encoding to use when reading the file.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(&#39;data.csv&#39;) # doctest: +SKIP</span>
<span class="sd"> Load multiple CSV files as a single DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv([&#39;data-01.csv&#39;, &#39;data-02.csv&#39;]) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># The latin-1 encoding is the same as iso-8859-1, which is why it is mapped to iso-8859-1.</span>
<span class="n">encoding_mapping</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;latin-1&quot;</span><span class="p">:</span> <span class="s2">&quot;iso-8859-1&quot;</span><span class="p">}</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">parse_dates</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">False</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;parse_dates can only be `False`: </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">parse_dates</span><span class="p">)</span>
<span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">):</span>
<span class="n">usecols</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;inferSchema&quot;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;sep&quot;</span><span class="p">,</span> <span class="n">sep</span><span class="p">)</span>
<span class="k">if</span> <span class="n">header</span> <span class="o">==</span> <span class="s2">&quot;infer&quot;</span><span class="p">:</span>
<span class="n">header</span> <span class="o">=</span> <span class="mi">0</span> <span class="k">if</span> <span class="n">names</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">header</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;header&quot;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;header&quot;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Unknown header argument </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">header</span><span class="p">))</span>
<span class="k">if</span> <span class="n">quotechar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;quote&quot;</span><span class="p">,</span> <span class="n">quotechar</span><span class="p">)</span>
<span class="k">if</span> <span class="n">escapechar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;escape&quot;</span><span class="p">,</span> <span class="n">escapechar</span><span class="p">)</span>
<span class="k">if</span> <span class="n">comment</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">comment</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">comment</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Only length-1 comment characters supported&quot;</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;comment&quot;</span><span class="p">,</span> <span class="n">comment</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="k">if</span> <span class="n">encoding</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;encoding&quot;</span><span class="p">,</span> <span class="n">encoding_mapping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">encoding</span><span class="p">,</span> <span class="n">encoding</span><span class="p">))</span>
<span class="n">column_labels</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">names</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">names</span><span class="p">)</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">names</span><span class="p">):</span>
<span class="n">names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">names</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">names</span><span class="p">))</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Found non-unique column index&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;The number of names [</span><span class="si">%s</span><span class="s2">] does not match the number &quot;</span>
<span class="s2">&quot;of columns [</span><span class="si">%d</span><span class="s2">]. Try names by a Spark SQL DDL-formatted &quot;</span>
<span class="s2">&quot;string.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">schema</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">names</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">elif</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="nb">enumerate</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">}</span>
<span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">missing</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span>
<span class="k">if</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">):</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">usecols</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="p">}</span>
<span class="n">missing</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span><span class="p">):</span>
<span class="n">usecols_ints</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">usecols</span><span class="p">)</span>
<span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">())</span>
<span class="k">if</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">usecols_ints</span>
<span class="p">}</span>
<span class="n">missing</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">col</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols_ints</span>
<span class="k">if</span> <span class="p">(</span>
<span class="n">col</span> <span class="o">&gt;=</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span>
<span class="ow">or</span> <span class="nb">list</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)[</span><span class="n">col</span><span class="p">]</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">new_column_labels</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">new_column_labels</span>
<span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span><span class="p">):</span>
<span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">usecols</span>
<span class="p">}</span>
<span class="n">missing</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span> <span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">new_column_labels</span><span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">new_column_labels</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;&#39;usecols&#39; must either be list-like of all strings, &quot;</span>
<span class="s2">&quot;all unicode, all integers or a callable.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">missing</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Usecols do not match columns, columns expected but not &quot;</span> <span class="s2">&quot;found: </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">missing</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">values</span><span class="p">()])</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{}</span>
<span class="k">if</span> <span class="n">nrows</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="n">nrows</span><span class="p">)</span>
<span class="n">index_spark_column_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span>
<span class="n">index_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">)):</span>
<span class="n">index_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">:</span>
<span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">column_labels</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">index_col</span>
<span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_csv`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span>
<span class="n">label</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">label</span><span class="p">,)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span>
<span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">values</span><span class="p">()],</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dtype</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">for</span> <span class="n">col</span><span class="p">,</span> <span class="n">tpe</span> <span class="ow">in</span> <span class="n">dtype</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">tpe</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<div class="viewcode-block" id="read_json"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_json.html#pyspark.pandas.read_json">[docs]</a><span class="k">def</span> <span class="nf">read_json</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">lines</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a JSON file into a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string</span>
<span class="sd"> File path</span>
<span class="sd"> lines : bool, default True</span>
<span class="sd"> Read the file as one JSON object per line. It must be True for now; passing False raises NotImplementedError.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[&#39;a&#39;, &#39;b&#39;], [&#39;c&#39;, &#39;d&#39;]],</span>
<span class="sd"> ... columns=[&#39;col 1&#39;, &#39;col 2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.to_json(path=r&#39;%s/read_json/foo.json&#39; % path, num_files=1)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/read_json/foo.json&#39; % path</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;)</span>
<span class="sd"> col 1 col 2</span>
<span class="sd"> 0 a b</span>
<span class="sd"> 1 c d</span>
<span class="sd"> &gt;&gt;&gt; df.to_json(path=r&#39;%s/read_json/foo.json&#39; % path, num_files=1, lineSep=&#39;___&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/read_json/foo.json&#39; % path, lineSep=&#39;___&#39;</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;)</span>
<span class="sd"> col 1 col 2</span>
<span class="sd"> 0 a b</span>
<span class="sd"> 1 c d</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; df.to_json(path=r&#39;%s/read_json/bar.json&#39; % path, num_files=1, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/read_json/bar.json&#39; % path, index_col=&quot;index&quot;</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> col 1 col 2</span>
<span class="sd"> index</span>
<span class="sd"> 0 a b</span>
<span class="sd"> 1 c d</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_json`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">lines</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;lines=False is not implemented yet.&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;json&quot;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_delta"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_delta.html#pyspark.pandas.read_delta">[docs]</a><span class="k">def</span> <span class="nf">read_delta</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">version</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">timestamp</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a Delta Lake table on some file system and return a DataFrame.</span>
<span class="sd"> If the Delta Lake table is already stored in the catalog (aka the metastore), use &#39;read_table&#39;.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string</span>
<span class="sd"> Path to the Delta Lake table.</span>
<span class="sd"> version : string, optional</span>
<span class="sd"> Specifies the table version (based on Delta&#39;s internal transaction version) to read from,</span>
<span class="sd"> using Delta&#39;s time travel feature. This sets Delta&#39;s &#39;versionAsOf&#39; option. Note that</span>
<span class="sd"> this parameter and `timestamp` parameter cannot be used together, otherwise it will raise a</span>
<span class="sd"> `ValueError`.</span>
<span class="sd"> timestamp : string, optional</span>
<span class="sd"> Specifies the table version (based on timestamp) to read from,</span>
<span class="sd"> using Delta&#39;s time travel feature. This must be a valid date or timestamp string in Spark,</span>
<span class="sd"> and sets Delta&#39;s &#39;timestampAsOf&#39; option. Note that this parameter and `version` parameter</span>
<span class="sd"> cannot be used together, otherwise it will raise a `ValueError`.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options</span>
<span class="sd"> Additional options that can be passed on to Delta.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_delta</span>
<span class="sd"> read_table</span>
<span class="sd"> read_spark_io</span>
<span class="sd"> read_parquet</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_delta(&#39;%s/read_delta/foo&#39; % path) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/foo&#39; % path) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).to_delta(&#39;%s/read_delta/foo&#39; % path,</span>
<span class="sd"> ... mode=&#39;overwrite&#39;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/foo&#39; % path) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/foo&#39; % path, version=0) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).to_delta(</span>
<span class="sd"> ... &#39;%s/read_delta/bar&#39; % path, index_col=&quot;index&quot;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/bar&#39; % path, index_col=&quot;index&quot;) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_delta`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">version</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;version and timestamp cannot be used together.&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">version</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">options</span><span class="p">[</span><span class="s2">&quot;versionAsOf&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">version</span>
<span class="k">if</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">options</span><span class="p">[</span><span class="s2">&quot;timestampAsOf&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">timestamp</span>
<span class="k">return</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;delta&quot;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_table.html#pyspark.pandas.read_table">[docs]</a><span class="k">def</span> <span class="nf">read_table</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a Spark table and return a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : string</span>
<span class="sd"> Table name in Spark.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_table</span>
<span class="sd"> read_delta</span>
<span class="sd"> read_parquet</span>
<span class="sd"> read_spark_io</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_table(&#39;%s.my_table&#39; % db)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_table(&#39;%s.my_table&#39; % db)</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_table(&#39;%s.my_table&#39; % db, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_table(&#39;%s.my_table&#39; % db, index_col=&quot;index&quot;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_table`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="read_spark_io"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_spark_io.html#pyspark.pandas.read_spark_io">[docs]</a><span class="k">def</span> <span class="nf">read_spark_io</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;StructType&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Load a DataFrame from a Spark data source.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string, optional</span>
<span class="sd"> Path to the data source.</span>
<span class="sd"> format : string, optional</span>
<span class="sd"> Specifies the input data source format. Some common ones are:</span>
<span class="sd"> - &#39;delta&#39;</span>
<span class="sd"> - &#39;parquet&#39;</span>
<span class="sd"> - &#39;orc&#39;</span>
<span class="sd"> - &#39;json&#39;</span>
<span class="sd"> - &#39;csv&#39;</span>
<span class="sd"> schema : string or StructType, optional</span>
<span class="sd"> Input schema. If none, Spark tries to infer the schema automatically.</span>
<span class="sd"> The schema can either be a Spark StructType, or a DDL-formatted string like</span>
<span class="sd"> `col0 INT, col1 DOUBLE`.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_table</span>
<span class="sd"> read_delta</span>
<span class="sd"> read_parquet</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).spark.to_spark_io(&#39;%s/read_spark_io/data.parquet&#39; % path)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_spark_io(</span>
<span class="sd"> ... &#39;%s/read_spark_io/data.parquet&#39; % path, format=&#39;parquet&#39;, schema=&#39;id long&#39;)</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).spark.to_spark_io(&#39;%s/read_spark_io/data.json&#39; % path,</span>
<span class="sd"> ... format=&#39;json&#39;, lineSep=&#39;__&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_spark_io(</span>
<span class="sd"> ... &#39;%s/read_spark_io/data.json&#39; % path, format=&#39;json&#39;, schema=&#39;id long&#39;, lineSep=&#39;__&#39;)</span>
<span class="sd"> id</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).spark.to_spark_io(&#39;%s/read_spark_io/data.orc&#39; % path,</span>
<span class="sd"> ... format=&#39;orc&#39;, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_spark_io(</span>
<span class="sd"> ... path=r&#39;%s/read_spark_io/data.orc&#39; % path, format=&quot;orc&quot;, index_col=&quot;index&quot;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="read_parquet"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_parquet.html#pyspark.pandas.read_parquet">[docs]</a><span class="k">def</span> <span class="nf">read_parquet</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">pandas_metadata</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Load a parquet object from the file path, returning a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string</span>
<span class="sd"> File path</span>
<span class="sd"> columns : list, default=None</span>
<span class="sd"> If not None, only these columns will be read from the file.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> pandas_metadata : bool, default: False</span>
<span class="sd"> If True, try to respect the metadata if the Parquet file is written from pandas.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_parquet</span>
<span class="sd"> read_table</span>
<span class="sd"> read_delta</span>
<span class="sd"> read_spark_io</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path, columns=[&#39;id&#39;])</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path, columns=[&#39;id&#39;], index_col=&quot;index&quot;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_parquet`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">pandas_metadata</span><span class="p">:</span>
<span class="c1"># Try to read pandas metadata</span>
<span class="nd">@pandas_udf</span><span class="p">(</span> <span class="c1"># type: ignore[call-overload]</span>
<span class="s2">&quot;index_col array&lt;string&gt;, index_names array&lt;string&gt;&quot;</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">read_index_metadata</span><span class="p">(</span><span class="n">pser</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="n">binary</span> <span class="o">=</span> <span class="n">pser</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">BufferReader</span><span class="p">(</span><span class="n">binary</span><span class="p">))</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">metadata</span>
<span class="k">if</span> <span class="sa">b</span><span class="s2">&quot;pandas&quot;</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">:</span>
<span class="n">pandas_metadata</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">metadata</span><span class="p">[</span><span class="sa">b</span><span class="s2">&quot;pandas&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">&quot;utf8&quot;</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">&quot;index_columns&quot;</span><span class="p">]):</span>
<span class="n">index_col</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">&quot;index_columns&quot;</span><span class="p">]:</span>
<span class="n">index_col</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">&quot;columns&quot;</span><span class="p">]:</span>
<span class="k">if</span> <span class="n">column</span><span class="p">[</span><span class="s2">&quot;field_name&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="n">col</span><span class="p">:</span>
<span class="n">index_names</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column</span><span class="p">[</span><span class="s2">&quot;name&quot;</span><span class="p">])</span>
<span class="k">break</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_names</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;index_col&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">index_col</span><span class="p">],</span> <span class="s2">&quot;index_names&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">index_names</span><span class="p">]})</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;index_col&quot;</span><span class="p">:</span> <span class="p">[</span><span class="kc">None</span><span class="p">],</span> <span class="s2">&quot;index_names&quot;</span><span class="p">:</span> <span class="p">[</span><span class="kc">None</span><span class="p">]})</span>
<span class="n">index_col</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">default_session</span><span class="p">()</span>
<span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;binaryFile&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">read_index_metadata</span><span class="p">(</span><span class="s2">&quot;content&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;index_metadata&quot;</span><span class="p">))</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;index_metadata.*&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="n">options</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">new_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">columns</span> <span class="k">if</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_columns</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">new_columns</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">index_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="n">index_names</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<div class="viewcode-block" id="read_clipboard"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_clipboard.html#pyspark.pandas.read_clipboard">[docs]</a><span class="k">def</span> <span class="nf">read_clipboard</span><span class="p">(</span><span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">&quot;\s+&quot;</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read text from clipboard and pass to read_csv. See read_csv for the</span>
<span class="sd"> full argument list.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sep : str, default &#39;\s+&#39;</span>
<span class="sd"> A string or regex delimiter. The default of &#39;\s+&#39; denotes</span>
<span class="sd"> one or more whitespace characters.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_clipboard : Write text out to clipboard.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> parsed : DataFrame</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">read_clipboard</span><span class="p">(</span><span class="n">sep</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)))</span></div>
<div class="viewcode-block" id="read_excel"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_excel.html#pyspark.pandas.read_excel">[docs]</a><span class="k">def</span> <span class="nf">read_excel</span><span class="p">(</span>
<span class="n">io</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span>
<span class="n">sheet_name</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">usecols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">engine</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">converters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">true_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">false_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skiprows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">nrows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">na_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">verbose</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">date_parser</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">thousands</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">comment</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skipfooter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwds</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read an Excel file into a pandas-on-Spark DataFrame or Series.</span>
<span class="sd"> Support both `xls` and `xlsx` file extensions from a local filesystem or URL.</span>
<span class="sd"> Support an option to read a single sheet or a list of sheets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> io : str, file descriptor, pathlib.Path, ExcelFile or xlrd.Book</span>
<span class="sd"> The string could be a URL. The value URL must be available in Spark&#39;s DataFrameReader.</span>
<span class="sd"> .. note::</span>
<span class="sd"> If the underlying Spark is below 3.0, the parameter as a string is not supported.</span>
<span class="sd"> You can use `ps.from_pandas(pd.read_excel(...))` as a workaround.</span>
<span class="sd"> sheet_name : str, int, list, or None, default 0</span>
<span class="sd"> Strings are used for sheet names. Integers are used in zero-indexed</span>
<span class="sd"> sheet positions. Lists of strings/integers are used to request</span>
<span class="sd"> multiple sheets. Specify None to get all sheets.</span>
<span class="sd"> Available cases:</span>
<span class="sd"> * Defaults to ``0``: 1st sheet as a `DataFrame`</span>
<span class="sd"> * ``1``: 2nd sheet as a `DataFrame`</span>
<span class="sd"> * ``&quot;Sheet1&quot;``: Load sheet with name &quot;Sheet1&quot;</span>
<span class="sd"> * ``[0, 1, &quot;Sheet5&quot;]``: Load first, second and sheet named &quot;Sheet5&quot;</span>
<span class="sd"> as a dict of `DataFrame`</span>
<span class="sd"> * None: All sheets.</span>
<span class="sd"> header : int, list of int, default 0</span>
<span class="sd"> Row (0-indexed) to use for the column labels of the parsed</span>
<span class="sd"> DataFrame. If a list of integers is passed those row positions will</span>
<span class="sd"> be combined into a ``MultiIndex``. Use None if there is no header.</span>
<span class="sd"> names : array-like, default None</span>
<span class="sd"> List of column names to use. If file contains no header row,</span>
<span class="sd"> then you should explicitly pass header=None.</span>
<span class="sd"> index_col : int, list of int, default None</span>
<span class="sd"> Column (0-indexed) to use as the row labels of the DataFrame.</span>
<span class="sd"> Pass None if there is no such column. If a list is passed,</span>
<span class="sd"> those columns will be combined into a ``MultiIndex``. If a</span>
<span class="sd"> subset of data is selected with ``usecols``, index_col</span>
<span class="sd"> is based on the subset.</span>
<span class="sd"> usecols : int, str, list-like, or callable, default None</span>
<span class="sd"> Return a subset of the columns.</span>
<span class="sd"> * If None, then parse all columns.</span>
<span class="sd"> * If str, then indicates comma separated list of Excel column letters</span>
<span class="sd"> and column ranges (e.g. &quot;A:E&quot; or &quot;A,C,E:F&quot;). Ranges are inclusive of</span>
<span class="sd"> both sides.</span>
<span class="sd"> * If list of int, then indicates list of column numbers to be parsed.</span>
<span class="sd"> * If list of string, then indicates list of column names to be parsed.</span>
<span class="sd"> * If callable, then evaluate each column name against it and parse the</span>
<span class="sd"> column if the callable returns ``True``.</span>
<span class="sd"> dtype : Type name or dict of column -&gt; type, default None</span>
<span class="sd"> Data type for data or columns. E.g. {&#39;a&#39;: np.float64, &#39;b&#39;: np.int32}</span>
<span class="sd"> Use `object` to preserve data as stored in Excel and not interpret dtype.</span>
<span class="sd"> If converters are specified, they will be applied INSTEAD</span>
<span class="sd"> of dtype conversion.</span>
<span class="sd"> engine : str, default None</span>
<span class="sd"> If io is not a buffer or path, this must be set to identify io.</span>
<span class="sd"> Acceptable values are None or xlrd.</span>
<span class="sd"> converters : dict, default None</span>
<span class="sd"> Dict of functions for converting values in certain columns. Keys can</span>
<span class="sd"> either be integers or column labels, values are functions that take one</span>
<span class="sd"> input argument, the Excel cell content, and return the transformed</span>
<span class="sd"> content.</span>
<span class="sd"> true_values : list, default None</span>
<span class="sd"> Values to consider as True.</span>
<span class="sd"> false_values : list, default None</span>
<span class="sd"> Values to consider as False.</span>
<span class="sd"> skiprows : list-like</span>
<span class="sd"> Rows to skip at the beginning (0-indexed).</span>
<span class="sd"> nrows : int, default None</span>
<span class="sd"> Number of rows to parse.</span>
<span class="sd"> na_values : scalar, str, list-like, or dict, default None</span>
<span class="sd"> Additional strings to recognize as NA/NaN. If dict passed, specific</span>
<span class="sd"> per-column NA values. By default, the standard pandas NA markers</span>
<span class="sd"> (such as ``NA``, ``NaN`` and ``null``) are interpreted as NaN.</span>
<span class="sd"> keep_default_na : bool, default True</span>
<span class="sd"> If na_values are specified and keep_default_na is False the default NaN</span>
<span class="sd"> values are overridden, otherwise they&#39;re appended to.</span>
<span class="sd"> verbose : bool, default False</span>
<span class="sd"> Indicate number of NA values placed in non-numeric columns.</span>
<span class="sd"> parse_dates : bool, list-like, or dict, default False</span>
<span class="sd"> The behavior is as follows:</span>
<span class="sd"> * bool. If True -&gt; try parsing the index.</span>
<span class="sd"> * list of int or names. e.g. If [1, 2, 3] -&gt; try parsing columns 1, 2, 3</span>
<span class="sd"> each as a separate date column.</span>
<span class="sd"> * list of lists. e.g. If [[1, 3]] -&gt; combine columns 1 and 3 and parse as</span>
<span class="sd"> a single date column.</span>
<span class="sd"> * dict, e.g. {&#39;foo&#39; : [1, 3]} -&gt; parse columns 1, 3 as date and call</span>
<span class="sd"> result &#39;foo&#39;</span>
<span class="sd"> If a column or index contains an unparseable date, the entire column or</span>
<span class="sd"> index will be returned unaltered as an object data type. For non-standard</span>
<span class="sd"> datetime parsing, use ``ps.to_datetime`` after ``ps.read_excel``.</span>
<span class="sd"> Note: A fast-path exists for iso8601-formatted dates.</span>
<span class="sd"> date_parser : function, optional</span>
<span class="sd"> Function to use for converting a sequence of string columns to an array of</span>
<span class="sd"> datetime instances. The default uses ``dateutil.parser.parser`` to do the</span>
<span class="sd"> conversion. pandas-on-Spark will try to call `date_parser` in three different ways,</span>
<span class="sd"> advancing to the next if an exception occurs: 1) Pass one or more arrays</span>
<span class="sd"> (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the</span>
<span class="sd"> string values from the columns defined by `parse_dates` into a single array</span>
<span class="sd"> and pass that; and 3) call `date_parser` once for each row using one or</span>
<span class="sd"> more strings (corresponding to the columns defined by `parse_dates`) as</span>
<span class="sd"> arguments.</span>
<span class="sd"> thousands : str, default None</span>
<span class="sd"> Thousands separator for parsing string columns to numeric. Note that</span>
<span class="sd"> this parameter is only necessary for columns stored as TEXT in Excel,</span>
<span class="sd"> any numeric columns will automatically be parsed, regardless of display</span>
<span class="sd"> format.</span>
<span class="sd"> comment : str, default None</span>
<span class="sd"> Comments out remainder of line. Pass a character or characters to this</span>
<span class="sd"> argument to indicate comments in the input file. Any data between the</span>
<span class="sd"> comment string and the end of the current line is ignored.</span>
<span class="sd"> skipfooter : int, default 0</span>
<span class="sd"> Rows at the end to skip (0-indexed).</span>
<span class="sd"> **kwds : optional</span>
<span class="sd"> Optional keyword arguments can be passed to ``TextFileReader``.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or dict of DataFrames</span>
<span class="sd"> DataFrame from the passed in Excel file. See notes in sheet_name</span>
<span class="sd"> argument for more information on when a dict of DataFrames is returned.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_excel : Write DataFrame to an Excel file.</span>
<span class="sd"> DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.</span>
<span class="sd"> read_csv : Read a comma-separated values (csv) file into DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> The file can be read using the file name as string or an open file object:</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 string1 1</span>
<span class="sd"> 1 string2 2</span>
<span class="sd"> 2 #Comment 3</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(open(&#39;tmp.xlsx&#39;, &#39;rb&#39;),</span>
<span class="sd"> ... sheet_name=&#39;Sheet3&#39;) # doctest: +SKIP</span>
<span class="sd"> Unnamed: 0 Name Value</span>
<span class="sd"> 0 0 string1 1</span>
<span class="sd"> 1 1 string2 2</span>
<span class="sd"> 2 2 #Comment 3</span>
<span class="sd"> Index and header can be specified via the `index_col` and `header` arguments</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=None, header=None) # doctest: +SKIP</span>
<span class="sd"> 0 1 2</span>
<span class="sd"> 0 NaN Name Value</span>
<span class="sd"> 1 0.0 string1 1</span>
<span class="sd"> 2 1.0 string2 2</span>
<span class="sd"> 3 2.0 #Comment 3</span>
<span class="sd"> Column types are inferred but can be explicitly specified</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0,</span>
<span class="sd"> ... dtype={&#39;Name&#39;: str, &#39;Value&#39;: float}) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 string1 1.0</span>
<span class="sd"> 1 string2 2.0</span>
<span class="sd"> 2 #Comment 3.0</span>
<span class="sd"> True, False, and NA values, and thousands separators have defaults,</span>
<span class="sd"> but can be explicitly specified, too. Supply the values you would like</span>
<span class="sd"> as strings or lists of strings!</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0,</span>
<span class="sd"> ... na_values=[&#39;string1&#39;, &#39;string2&#39;]) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 None 1</span>
<span class="sd"> 1 None 2</span>
<span class="sd"> 2 #Comment 3</span>
<span class="sd"> Comment lines in the excel input file can be skipped using the `comment` kwarg</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0, comment=&#39;#&#39;) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 string1 1.0</span>
<span class="sd"> 1 string2 2.0</span>
<span class="sd"> 2 None NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">pd_read_excel</span><span class="p">(</span>
<span class="n">io_or_bin</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">sn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span>
<span class="n">io</span><span class="o">=</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">,</span> <span class="p">(</span><span class="nb">bytes</span><span class="p">,</span> <span class="nb">bytearray</span><span class="p">))</span> <span class="k">else</span> <span class="n">io_or_bin</span><span class="p">,</span>
<span class="n">sheet_name</span><span class="o">=</span><span class="n">sn</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">,</span>
<span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span>
<span class="n">usecols</span><span class="o">=</span><span class="n">usecols</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">engine</span><span class="o">=</span><span class="n">engine</span><span class="p">,</span>
<span class="n">converters</span><span class="o">=</span><span class="n">converters</span><span class="p">,</span>
<span class="n">true_values</span><span class="o">=</span><span class="n">true_values</span><span class="p">,</span>
<span class="n">false_values</span><span class="o">=</span><span class="n">false_values</span><span class="p">,</span>
<span class="n">skiprows</span><span class="o">=</span><span class="n">skiprows</span><span class="p">,</span>
<span class="n">nrows</span><span class="o">=</span><span class="n">nrows</span><span class="p">,</span>
<span class="n">na_values</span><span class="o">=</span><span class="n">na_values</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="o">=</span><span class="n">keep_default_na</span><span class="p">,</span>
<span class="n">verbose</span><span class="o">=</span><span class="n">verbose</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="o">=</span><span class="n">parse_dates</span><span class="p">,</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="n">date_parser</span><span class="o">=</span><span class="n">date_parser</span><span class="p">,</span>
<span class="n">thousands</span><span class="o">=</span><span class="n">thousands</span><span class="p">,</span>
<span class="n">comment</span><span class="o">=</span><span class="n">comment</span><span class="p">,</span>
<span class="n">skipfooter</span><span class="o">=</span><span class="n">skipfooter</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwds</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">io</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="c1"># &#39;binaryFile&#39; format is available since Spark 3.0.0.</span>
<span class="n">binaries</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;binaryFile&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">io</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;content&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="n">io_or_bin</span> <span class="o">=</span> <span class="n">binaries</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="n">single_file</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">binaries</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">io_or_bin</span> <span class="o">=</span> <span class="n">io</span>
<span class="n">single_file</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">pdf_or_psers</span> <span class="o">=</span> <span class="n">pd_read_excel</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">,</span> <span class="n">sn</span><span class="o">=</span><span class="n">sheet_name</span><span class="p">)</span>
<span class="k">if</span> <span class="n">single_file</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span>
<span class="n">sn</span><span class="p">:</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">))</span>
<span class="k">for</span> <span class="n">sn</span><span class="p">,</span> <span class="n">pdf_or_pser</span> <span class="ow">in</span> <span class="n">pdf_or_psers</span><span class="o">.</span><span class="n">items</span><span class="p">()</span>
<span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">read_excel_on_spark</span><span class="p">(</span>
<span class="n">pdf_or_pser</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span>
<span class="n">sn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf_or_pser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf_or_pser</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf</span><span class="p">))</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">force_decimal_precision_scale</span><span class="p">(</span>
<span class="n">as_nullable_spark_type</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">output_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">pd_read_excel</span><span class="p">(</span><span class="nb">bin</span><span class="p">,</span> <span class="n">sn</span><span class="o">=</span><span class="n">sn</span><span class="p">)</span> <span class="k">for</span> <span class="nb">bin</span> <span class="ow">in</span> <span class="n">pdf</span><span class="p">[</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]]])</span>
<span class="n">reset_index</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>
<span class="k">for</span> <span class="n">name</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">reset_index</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">dt</span> <span class="o">=</span> <span class="n">col</span><span class="o">.</span><span class="n">dtype</span>
<span class="k">if</span> <span class="n">is_datetime64_dtype</span><span class="p">(</span><span class="n">dt</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dt</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DatetimeTZDtype</span><span class="p">):</span>
<span class="k">continue</span>
<span class="n">reset_index</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="n">col</span><span class="o">.</span><span class="n">replace</span><span class="p">({</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">:</span> <span class="kc">None</span><span class="p">})</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">reset_index</span>
<span class="c1"># Just positionally map the column names to given schema&#39;s.</span>
<span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">return_schema</span><span class="o">.</span><span class="n">names</span><span class="p">)))</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">default_session</span><span class="p">()</span>
<span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;binaryFile&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">io</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;content&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span><span class="k">lambda</span> <span class="n">iterator</span><span class="p">:</span> <span class="nb">map</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">iterator</span><span class="p">),</span> <span class="n">schema</span><span class="o">=</span><span class="n">return_schema</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span>
<span class="n">sn</span><span class="p">:</span> <span class="n">read_excel_on_spark</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">,</span> <span class="n">sn</span><span class="p">)</span> <span class="k">for</span> <span class="n">sn</span><span class="p">,</span> <span class="n">pdf_or_pser</span> <span class="ow">in</span> <span class="n">pdf_or_psers</span><span class="o">.</span><span class="n">items</span><span class="p">()</span>
<span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">read_excel_on_spark</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="n">sheet_name</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_html"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_html.html#pyspark.pandas.read_html">[docs]</a><span class="k">def</span> <span class="nf">read_html</span><span class="p">(</span>
<span class="n">io</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span>
<span class="n">match</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;.+&quot;</span><span class="p">,</span>
<span class="n">flavor</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skiprows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="nb">slice</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">attrs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">thousands</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;,&quot;</span><span class="p">,</span>
<span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">decimal</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;.&quot;</span><span class="p">,</span>
<span class="n">converters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">na_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">displayed_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Read HTML tables into a ``list`` of ``DataFrame`` objects.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> io : str or file-like</span>
<span class="sd"> A URL, a file-like object, or a raw string containing HTML. Note that</span>
<span class="sd"> lxml only accepts the HTTP, FTP and file URL protocols. If you have a</span>
<span class="sd"> URL that starts with ``&#39;https&#39;`` you might try removing the ``&#39;s&#39;``.</span>
<span class="sd"> .. deprecated:: 4.0.0</span>
<span class="sd"> Passing html literal strings is deprecated.</span>
<span class="sd"> Wrap literal string/bytes input in io.StringIO/io.BytesIO instead.</span>
<span class="sd"> match : str or compiled regular expression, optional</span>
<span class="sd"> The set of tables containing text matching this regex or string will be</span>
<span class="sd"> returned. Unless the HTML is extremely simple you will probably need to</span>
<span class="sd"> pass a non-empty string here. Defaults to &#39;.+&#39; (match any non-empty</span>
<span class="sd"> string). The default value will return all tables contained on a page.</span>
<span class="sd"> This value is converted to a regular expression so that there is</span>
<span class="sd"> consistent behavior between Beautiful Soup and lxml.</span>
<span class="sd"> flavor : str or None, container of strings</span>
<span class="sd"> The parsing engine to use. &#39;bs4&#39; and &#39;html5lib&#39; are synonymous with</span>
<span class="sd"> each other, they are both there for backwards compatibility. The</span>
<span class="sd"> default of ``None`` tries to use ``lxml`` to parse and if that fails it</span>
<span class="sd"> falls back on ``bs4`` + ``html5lib``.</span>
<span class="sd"> header : int or list-like or None, optional</span>
<span class="sd"> The row (or list of rows for a :class:`~ps.MultiIndex`) to use to</span>
<span class="sd"> make the columns headers.</span>
<span class="sd"> index_col : int or list-like or None, optional</span>
<span class="sd"> The column (or list of columns) to use to create the index.</span>
<span class="sd"> skiprows : int or list-like or slice or None, optional</span>
<span class="sd"> 0-based. Number of rows to skip after parsing the column integer. If a</span>
<span class="sd"> sequence of integers or a slice is given, will skip the rows indexed by</span>
<span class="sd"> that sequence. Note that a single element sequence means &#39;skip the nth</span>
<span class="sd"> row&#39; whereas an integer means &#39;skip n rows&#39;.</span>
<span class="sd"> attrs : dict or None, optional</span>
<span class="sd"> This is a dictionary of attributes that you can pass to use to identify</span>
<span class="sd"> the table in the HTML. These are not checked for validity before being</span>
<span class="sd"> passed to lxml or Beautiful Soup. However, these attributes must be</span>
<span class="sd"> valid HTML table attributes to work correctly. For example, ::</span>
<span class="sd"> attrs = {&#39;id&#39;: &#39;table&#39;}</span>
<span class="sd"> is a valid attribute dictionary because the &#39;id&#39; HTML tag attribute is</span>
<span class="sd"> a valid HTML attribute for *any* HTML tag as per `this document</span>
<span class="sd"> &lt;http://www.w3.org/TR/html-markup/global-attributes.html&gt;`__. ::</span>
<span class="sd"> attrs = {&#39;asdf&#39;: &#39;table&#39;}</span>
<span class="sd"> is *not* a valid attribute dictionary because &#39;asdf&#39; is not a valid</span>
<span class="sd"> HTML attribute even if it is a valid XML attribute. Valid HTML 4.01</span>
<span class="sd"> table attributes can be found `here</span>
<span class="sd"> &lt;http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2&gt;`__. A</span>
<span class="sd"> working draft of the HTML 5 spec can be found `here</span>
<span class="sd"> &lt;http://www.w3.org/TR/html-markup/table.html&gt;`__. It contains the</span>
<span class="sd"> latest information on table attributes for the modern web.</span>
<span class="sd"> parse_dates : bool, optional</span>
<span class="sd"> See :func:`~ps.read_csv` for more details.</span>
<span class="sd"> thousands : str, optional</span>
<span class="sd"> Separator to use to parse thousands. Defaults to ``&#39;,&#39;``.</span>
<span class="sd"> encoding : str or None, optional</span>
<span class="sd"> The encoding used to decode the web page. Defaults to ``None``. ``None``</span>
<span class="sd"> preserves the previous encoding behavior, which depends on the</span>
<span class="sd"> underlying parser library (e.g., the parser library will try to use</span>
<span class="sd"> the encoding provided by the document).</span>
<span class="sd"> decimal : str, default &#39;.&#39;</span>
<span class="sd"> Character to recognize as decimal point (example: use &#39;,&#39; for European</span>
<span class="sd"> data).</span>
<span class="sd"> converters : dict, default None</span>
<span class="sd"> Dict of functions for converting values in certain columns. Keys can</span>
<span class="sd"> either be integers or column labels, values are functions that take one</span>
<span class="sd"> input argument, the cell (not column) content, and return the</span>
<span class="sd"> transformed content.</span>
<span class="sd"> na_values : iterable, default None</span>
<span class="sd"> Custom NA values</span>
<span class="sd"> keep_default_na : bool, default True</span>
<span class="sd"> If na_values are specified and keep_default_na is False the default NaN</span>
<span class="sd"> values are overridden, otherwise ``na_values`` are appended to them.</span>
<span class="sd"> displayed_only : bool, default True</span>
<span class="sd"> Whether elements with &quot;display: none&quot; should be parsed</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> dfs : list of DataFrames</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_csv</span>
<span class="sd"> DataFrame.to_html</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">pdfs</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_html</span><span class="p">(</span>
<span class="n">io</span><span class="o">=</span><span class="n">io</span><span class="p">,</span>
<span class="n">match</span><span class="o">=</span><span class="n">match</span><span class="p">,</span>
<span class="n">flavor</span><span class="o">=</span><span class="n">flavor</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span>
<span class="n">skiprows</span><span class="o">=</span><span class="n">skiprows</span><span class="p">,</span>
<span class="n">attrs</span><span class="o">=</span><span class="n">attrs</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="o">=</span><span class="n">parse_dates</span><span class="p">,</span>
<span class="n">thousands</span><span class="o">=</span><span class="n">thousands</span><span class="p">,</span>
<span class="n">encoding</span><span class="o">=</span><span class="n">encoding</span><span class="p">,</span>
<span class="n">decimal</span><span class="o">=</span><span class="n">decimal</span><span class="p">,</span>
<span class="n">converters</span><span class="o">=</span><span class="n">converters</span><span class="p">,</span>
<span class="n">na_values</span><span class="o">=</span><span class="n">na_values</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="o">=</span><span class="n">keep_default_na</span><span class="p">,</span>
<span class="n">displayed_only</span><span class="o">=</span><span class="n">displayed_only</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">],</span> <span class="p">[</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="k">for</span> <span class="n">pdf</span> <span class="ow">in</span> <span class="n">pdfs</span><span class="p">])</span></div>
<span class="c1"># TODO: add `coerce_float` and &#39;parse_dates&#39; parameters</span>
<div class="viewcode-block" id="read_sql_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql_table.html#pyspark.pandas.read_sql_table">[docs]</a><span class="k">def</span> <span class="nf">read_sql_table</span><span class="p">(</span>
<span class="n">table_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read SQL database table into a DataFrame.</span>
<span class="sd"> Given a table name and a JDBC URI, returns a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> table_name : str</span>
<span class="sd"> Name of SQL table in database.</span>
<span class="sd"> con : str</span>
<span class="sd"> A JDBC URI could be provided as str.</span>
<span class="sd"> .. note:: The URI must be a JDBC URI rather than a Python database URI.</span>
<span class="sd"> schema : str, default None</span>
<span class="sd"> Name of SQL schema in database to query (if database flavor</span>
<span class="sd"> supports this). Uses default schema if None (default).</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Column(s) to set as index(MultiIndex).</span>
<span class="sd"> columns : str or list of str, default None</span>
<span class="sd"> List of column names to select from SQL table.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s JDBC data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> A SQL table is returned as a two-dimensional data structure with labeled</span>
<span class="sd"> axes.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_sql_query : Read SQL query into a DataFrame.</span>
<span class="sd"> read_sql : Read SQL query or database table into a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql_table(&#39;table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;dbtable&quot;</span><span class="p">,</span> <span class="n">table_name</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;url&quot;</span><span class="p">,</span> <span class="n">con</span><span class="p">)</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;jdbc&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">()</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">columns</span><span class="p">]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">columns</span><span class="p">]</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<span class="c1"># TODO: add `coerce_float`, `params`, and &#39;parse_dates&#39; parameters</span>
<div class="viewcode-block" id="read_sql_query"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql_query.html#pyspark.pandas.read_sql_query">[docs]</a><span class="k">def</span> <span class="nf">read_sql_query</span><span class="p">(</span>
<span class="n">sql</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Read SQL query into a DataFrame.</span>
<span class="sd"> Returns a DataFrame corresponding to the result set of the query</span>
<span class="sd"> string. Optionally provide an `index_col` parameter to use one of the</span>
<span class="sd"> columns as the index, otherwise default index will be used.</span>
<span class="sd"> .. note:: Some databases might hit Spark issue SPARK-27596</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sql : string SQL query</span>
<span class="sd"> SQL query to be executed.</span>
<span class="sd"> con : str</span>
<span class="sd"> A JDBC URI could be provided as str.</span>
<span class="sd"> .. note:: The URI must be a JDBC URI rather than a Python database URI.</span>
<span class="sd"> index_col : string or list of strings, optional, default: None</span>
<span class="sd"> Column(s) to set as index(MultiIndex).</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s JDBC data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_sql_table : Read SQL database table into a DataFrame.</span>
<span class="sd"> read_sql</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql_query(&#39;SELECT * FROM table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;query&quot;</span><span class="p">,</span> <span class="n">sql</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;url&quot;</span><span class="p">,</span> <span class="n">con</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;jdbc&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">()</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span></div>
<span class="c1"># TODO: add `coerce_float`, `params`, and &#39;parse_dates&#39; parameters</span>
<div class="viewcode-block" id="read_sql"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql.html#pyspark.pandas.read_sql">[docs]</a><span class="k">def</span> <span class="nf">read_sql</span><span class="p">(</span>
<span class="n">sql</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read SQL query or database table into a DataFrame.</span>
<span class="sd"> This function is a convenience wrapper around ``read_sql_table`` and</span>
<span class="sd"> ``read_sql_query`` (for backward compatibility). It will delegate</span>
<span class="sd"> to the specific function depending on the provided input. A SQL query</span>
<span class="sd"> will be routed to ``read_sql_query``, while a database table name will</span>
<span class="sd"> be routed to ``read_sql_table``. Note that the delegated function might</span>
<span class="sd"> have more specific notes about their functionality not listed here.</span>
<span class="sd"> .. note:: Some databases might hit Spark issue SPARK-27596</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sql : string</span>
<span class="sd"> SQL query to be executed or a table name.</span>
<span class="sd"> con : str</span>
<span class="sd"> A JDBC URI could be provided as str.</span>
<span class="sd"> .. note:: The URI must be a JDBC URI rather than a Python database URI.</span>
<span class="sd"> index_col : string or list of strings, optional, default: None</span>
<span class="sd"> Column(s) to set as index(MultiIndex).</span>
<span class="sd"> columns : str or list of str, default: None</span>
<span class="sd"> List of column names to select from SQL table (only used when reading</span>
<span class="sd"> a table).</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s JDBC data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_sql_table : Read SQL database table into a DataFrame.</span>
<span class="sd"> read_sql_query : Read SQL query into a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql(&#39;table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql(&#39;SELECT * FROM table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">striped</span> <span class="o">=</span> <span class="n">sql</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="k">if</span> <span class="s2">&quot; &quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">striped</span><span class="p">:</span> <span class="c1"># TODO: identify the table name or not more precisely.</span>
<span class="k">return</span> <span class="n">read_sql_table</span><span class="p">(</span><span class="n">sql</span><span class="p">,</span> <span class="n">con</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">read_sql_query</span><span class="p">(</span><span class="n">sql</span><span class="p">,</span> <span class="n">con</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_datetime"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_datetime.html#pyspark.pandas.to_datetime">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">to_datetime</span><span class="p">(</span>
<span class="n">arg</span><span class="p">,</span>
<span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;raise&quot;</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">unit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">infer_datetime_format</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">origin</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;unix&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert argument to datetime.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arg : integer, float, string, datetime, list, tuple, 1-d array, Series</span>
<span class="sd"> or DataFrame/dict-like</span>
<span class="sd"> errors : {&#39;ignore&#39;, &#39;raise&#39;, &#39;coerce&#39;}, default &#39;raise&#39;</span>
<span class="sd"> - If &#39;raise&#39;, then invalid parsing will raise an exception</span>
<span class="sd"> - If &#39;coerce&#39;, then invalid parsing will be set as NaT</span>
<span class="sd"> - If &#39;ignore&#39;, then invalid parsing will return the input</span>
<span class="sd"> format : string, default None</span>
<span class="sd"> strftime to parse time, e.g. &quot;%d/%m/%Y&quot;, note that &quot;%f&quot; will parse</span>
<span class="sd"> all the way up to nanoseconds.</span>
<span class="sd"> unit : string, default None</span>
<span class="sd"> Unit of the arg (D,s,ms,us,ns), where the arg is an</span>
<span class="sd"> integer or float number. This will be based off the origin.</span>
<span class="sd"> Example, with unit=&#39;ms&#39; and origin=&#39;unix&#39; (the default), this</span>
<span class="sd"> would calculate the number of milliseconds to the unix epoch start.</span>
<span class="sd"> infer_datetime_format : boolean, default False</span>
<span class="sd"> If True and no `format` is given, attempt to infer the format of the</span>
<span class="sd"> datetime strings, and if it can be inferred, switch to a faster</span>
<span class="sd"> method of parsing them. In some cases this can increase the parsing</span>
<span class="sd"> speed by ~5-10x.</span>
<span class="sd"> origin : scalar, default &#39;unix&#39;</span>
<span class="sd"> Define the reference date. The numeric values would be parsed as number</span>
<span class="sd"> of units (defined by `unit`) since this reference date.</span>
<span class="sd"> - If &#39;unix&#39; (or POSIX) time; origin is set to 1970-01-01.</span>
<span class="sd"> - If &#39;julian&#39;, unit must be &#39;D&#39;, and origin is set to beginning of</span>
<span class="sd"> Julian Calendar. Julian day number 0 is assigned to the day starting</span>
<span class="sd"> at noon on January 1, 4713 BC.</span>
<span class="sd"> - If Timestamp convertible, origin is set to Timestamp identified by</span>
<span class="sd"> origin.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : datetime if parsing succeeded.</span>
<span class="sd"> Return type depends on input:</span>
<span class="sd"> - list-like: DatetimeIndex</span>
<span class="sd"> - Series: Series of datetime64 dtype</span>
<span class="sd"> - scalar: Timestamp</span>
<span class="sd"> In case when it is not possible to return designated types (e.g. when</span>
<span class="sd"> any element of input is before Timestamp.min or after Timestamp.max)</span>
<span class="sd"> return will have datetime.datetime type (or corresponding</span>
<span class="sd"> array/Series).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Assembling a datetime from multiple columns of a DataFrame. The keys can be</span>
<span class="sd"> common abbreviations like [&#39;year&#39;, &#39;month&#39;, &#39;day&#39;, &#39;minute&#39;, &#39;second&#39;,</span>
<span class="sd"> &#39;ms&#39;, &#39;us&#39;, &#39;ns&#39;] or plurals of the same.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;year&#39;: [2015, 2016],</span>
<span class="sd"> ... &#39;month&#39;: [2, 3],</span>
<span class="sd"> ... &#39;day&#39;: [4, 5]})</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(df)</span>
<span class="sd"> 0 2015-02-04</span>
<span class="sd"> 1 2016-03-05</span>
<span class="sd"> dtype: datetime64[ns]</span>
<span class="sd"> If a date does not meet the `timestamp limitations</span>
<span class="sd"> &lt;http://pandas.pydata.org/pandas-docs/stable/timeseries.html</span>
<span class="sd"> #timeseries-timestamp-limits&gt;`_, passing errors=&#39;ignore&#39;</span>
<span class="sd"> will return the original input instead of raising any exception.</span>
<span class="sd"> Passing errors=&#39;coerce&#39; will force an out-of-bounds date to NaT,</span>
<span class="sd"> in addition to forcing non-dates (or non-parseable dates) to NaT.</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(&#39;13000101&#39;, format=&#39;%Y%m%d&#39;, errors=&#39;ignore&#39;) # doctest: +SKIP</span>
<span class="sd"> datetime.datetime(1300, 1, 1, 0, 0)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(&#39;13000101&#39;, format=&#39;%Y%m%d&#39;, errors=&#39;coerce&#39;)</span>
<span class="sd"> NaT</span>
<span class="sd"> Passing infer_datetime_format=True can often speed up parsing</span>
<span class="sd"> if the input is not exactly in ISO8601 format, but is in a regular format.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([&#39;3/11/2000&#39;, &#39;3/12/2000&#39;, &#39;3/13/2000&#39;] * 1000)</span>
<span class="sd"> &gt;&gt;&gt; s.head()</span>
<span class="sd"> 0 3/11/2000</span>
<span class="sd"> 1 3/12/2000</span>
<span class="sd"> 2 3/13/2000</span>
<span class="sd"> 3 3/11/2000</span>
<span class="sd"> 4 3/12/2000</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; import timeit</span>
<span class="sd"> &gt;&gt;&gt; timeit.timeit(</span>
<span class="sd"> ... lambda: repr(ps.to_datetime(s, infer_datetime_format=True)),</span>
<span class="sd"> ... number = 1) # doctest: +SKIP</span>
<span class="sd"> 0.35832712500000063</span>
<span class="sd"> &gt;&gt;&gt; timeit.timeit(</span>
<span class="sd"> ... lambda: repr(ps.to_datetime(s, infer_datetime_format=False)),</span>
<span class="sd"> ... number = 1) # doctest: +SKIP</span>
<span class="sd"> 0.8895321660000004</span>
<span class="sd"> Using a unix epoch time</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(1490195805, unit=&#39;s&#39;)</span>
<span class="sd"> Timestamp(&#39;2017-03-22 15:16:45&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(1490195805433502912, unit=&#39;ns&#39;)</span>
<span class="sd"> Timestamp(&#39;2017-03-22 15:16:45.433502912&#39;)</span>
<span class="sd"> Using a non-unix epoch origin</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime([1, 2, 3], unit=&#39;D&#39;, origin=pd.Timestamp(&#39;1960-01-01&#39;))</span>
<span class="sd"> DatetimeIndex([&#39;1960-01-02&#39;, &#39;1960-01-03&#39;, &#39;1960-01-04&#39;], dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># mappings for assembling units</span>
<span class="c1"># From pandas: pandas.core.tools.datetimes</span>
<span class="n">_unit_map</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;year&quot;</span><span class="p">:</span> <span class="s2">&quot;year&quot;</span><span class="p">,</span>
<span class="s2">&quot;years&quot;</span><span class="p">:</span> <span class="s2">&quot;year&quot;</span><span class="p">,</span>
<span class="s2">&quot;month&quot;</span><span class="p">:</span> <span class="s2">&quot;month&quot;</span><span class="p">,</span>
<span class="s2">&quot;months&quot;</span><span class="p">:</span> <span class="s2">&quot;month&quot;</span><span class="p">,</span>
<span class="s2">&quot;day&quot;</span><span class="p">:</span> <span class="s2">&quot;day&quot;</span><span class="p">,</span>
<span class="s2">&quot;days&quot;</span><span class="p">:</span> <span class="s2">&quot;day&quot;</span><span class="p">,</span>
<span class="s2">&quot;hour&quot;</span><span class="p">:</span> <span class="s2">&quot;h&quot;</span><span class="p">,</span>
<span class="s2">&quot;hours&quot;</span><span class="p">:</span> <span class="s2">&quot;h&quot;</span><span class="p">,</span>
<span class="s2">&quot;minute&quot;</span><span class="p">:</span> <span class="s2">&quot;m&quot;</span><span class="p">,</span>
<span class="s2">&quot;minutes&quot;</span><span class="p">:</span> <span class="s2">&quot;m&quot;</span><span class="p">,</span>
<span class="s2">&quot;second&quot;</span><span class="p">:</span> <span class="s2">&quot;s&quot;</span><span class="p">,</span>
<span class="s2">&quot;seconds&quot;</span><span class="p">:</span> <span class="s2">&quot;s&quot;</span><span class="p">,</span>
<span class="s2">&quot;ms&quot;</span><span class="p">:</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span>
<span class="s2">&quot;millisecond&quot;</span><span class="p">:</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span>
<span class="s2">&quot;milliseconds&quot;</span><span class="p">:</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span>
<span class="s2">&quot;us&quot;</span><span class="p">:</span> <span class="s2">&quot;us&quot;</span><span class="p">,</span>
<span class="s2">&quot;microsecond&quot;</span><span class="p">:</span> <span class="s2">&quot;us&quot;</span><span class="p">,</span>
<span class="s2">&quot;microseconds&quot;</span><span class="p">:</span> <span class="s2">&quot;us&quot;</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">def</span> <span class="nf">pandas_to_datetime</span><span class="p">(</span>
<span class="n">pser_or_pdf</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="n">cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">datetime64</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pser_or_pdf</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">pser_or_pdf</span><span class="p">[</span><span class="n">cols</span><span class="p">]</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span>
<span class="n">pser_or_pdf</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">infer_datetime_format</span><span class="o">=</span><span class="n">infer_datetime_format</span><span class="p">,</span>
<span class="n">origin</span><span class="o">=</span><span class="n">origin</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="n">pandas_to_datetime</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="n">unit</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">_unit_map</span><span class="p">[</span><span class="n">k</span><span class="o">.</span><span class="n">lower</span><span class="p">()]</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">arg</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span> <span class="k">if</span> <span class="n">k</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="ow">in</span> <span class="n">_unit_map</span><span class="p">}</span>
<span class="n">unit_rev</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">k</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">unit</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
<span class="n">list_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">unit_rev</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">],</span> <span class="n">unit_rev</span><span class="p">[</span><span class="s2">&quot;month&quot;</span><span class="p">],</span> <span class="n">unit_rev</span><span class="p">[</span><span class="s2">&quot;day&quot;</span><span class="p">]]</span>
<span class="k">for</span> <span class="n">u</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;h&quot;</span><span class="p">,</span> <span class="s2">&quot;m&quot;</span><span class="p">,</span> <span class="s2">&quot;s&quot;</span><span class="p">,</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span> <span class="s2">&quot;us&quot;</span><span class="p">]:</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">unit_rev</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">u</span><span class="p">)</span>
<span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">arg</span><span class="p">:</span>
<span class="n">list_cols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">arg</span><span class="p">[</span><span class="n">list_cols</span><span class="p">]</span>
<span class="k">return</span> <span class="n">psdf</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="n">pandas_to_datetime</span><span class="p">,</span> <span class="n">list_cols</span><span class="p">)</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span>
<span class="n">arg</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">infer_datetime_format</span><span class="o">=</span><span class="n">infer_datetime_format</span><span class="p">,</span>
<span class="n">origin</span><span class="o">=</span><span class="n">origin</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="date_range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.date_range.html#pyspark.pandas.date_range">[docs]</a><span class="k">def</span> <span class="nf">date_range</span><span class="p">(</span>
<span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">end</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">freq</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tz</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">tzinfo</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">normalize</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inclusive</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;both&quot;</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DatetimeIndex</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a fixed frequency DatetimeIndex.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : str or datetime-like, optional</span>
<span class="sd"> Left bound for generating dates.</span>
<span class="sd"> end : str or datetime-like, optional</span>
<span class="sd"> Right bound for generating dates.</span>
<span class="sd"> periods : int, optional</span>
<span class="sd"> Number of periods to generate.</span>
<span class="sd"> freq : str or DateOffset, default &#39;D&#39;</span>
<span class="sd"> Frequency strings can have multiples, e.g. &#39;5H&#39;.</span>
<span class="sd"> tz : str or tzinfo, optional</span>
<span class="sd"> Time zone name for returning localized DatetimeIndex, for example</span>
<span class="sd"> &#39;Asia/Hong_Kong&#39;. By default, the resulting DatetimeIndex is</span>
<span class="sd"> time zone naive.</span>
<span class="sd"> normalize : bool, default False</span>
<span class="sd"> Normalize start/end dates to midnight before generating date range.</span>
<span class="sd"> name : str, default None</span>
<span class="sd"> Name of the resulting DatetimeIndex.</span>
<span class="sd"> inclusive : {&quot;both&quot;, &quot;neither&quot;, &quot;left&quot;, &quot;right&quot;}, default &quot;both&quot;</span>
<span class="sd"> Include boundaries; whether to set each bound as closed or open.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> **kwargs</span>
<span class="sd"> For compatibility. Has no effect on the result.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> rng : DatetimeIndex</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DatetimeIndex : An immutable container for datetimes.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,</span>
<span class="sd"> exactly three must be specified. If ``freq`` is omitted, the resulting</span>
<span class="sd"> ``DatetimeIndex`` will have ``periods`` linearly spaced elements between</span>
<span class="sd"> ``start`` and ``end`` (closed on both sides).</span>
<span class="sd"> To learn more about the frequency strings, please see `this link</span>
<span class="sd"> &lt;https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases&gt;`__.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> **Specifying the values**</span>
<span class="sd"> The next four examples generate the same `DatetimeIndex`, but vary</span>
<span class="sd"> the combination of `start`, `end` and `periods`.</span>
<span class="sd"> Specify `start` and `end`, with the default daily frequency.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, end=&#39;1/08/2018&#39;) # doctest: +SKIP</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-01&#39;, &#39;2018-01-02&#39;, &#39;2018-01-03&#39;, &#39;2018-01-04&#39;,</span>
<span class="sd"> &#39;2018-01-05&#39;, &#39;2018-01-06&#39;, &#39;2018-01-07&#39;, &#39;2018-01-08&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify `start` and `periods`, the number of periods (days).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, periods=8) # doctest: +SKIP</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-01&#39;, &#39;2018-01-02&#39;, &#39;2018-01-03&#39;, &#39;2018-01-04&#39;,</span>
<span class="sd"> &#39;2018-01-05&#39;, &#39;2018-01-06&#39;, &#39;2018-01-07&#39;, &#39;2018-01-08&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify `end` and `periods`, the number of periods (days).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(end=&#39;1/1/2018&#39;, periods=8) # doctest: +SKIP</span>
<span class="sd"> DatetimeIndex([&#39;2017-12-25&#39;, &#39;2017-12-26&#39;, &#39;2017-12-27&#39;, &#39;2017-12-28&#39;,</span>
<span class="sd"> &#39;2017-12-29&#39;, &#39;2017-12-30&#39;, &#39;2017-12-31&#39;, &#39;2018-01-01&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify `start`, `end`, and `periods`; the frequency is generated</span>
<span class="sd"> automatically (linearly spaced).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2018-04-24&#39;, end=&#39;2018-04-27&#39;, periods=3</span>
<span class="sd"> ... ) # doctest: +SKIP</span>
<span class="sd"> DatetimeIndex([&#39;2018-04-24 00:00:00&#39;, &#39;2018-04-25 12:00:00&#39;,</span>
<span class="sd"> &#39;2018-04-27 00:00:00&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> **Other Parameters**</span>
<span class="sd"> Changed the `freq` (frequency) to ``&#39;M&#39;`` (month end frequency).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, periods=5, freq=&#39;M&#39;) # doctest: +SKIP</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-31&#39;, &#39;2018-02-28&#39;, &#39;2018-03-31&#39;, &#39;2018-04-30&#39;,</span>
<span class="sd"> &#39;2018-05-31&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Multiples are allowed</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, periods=5, freq=&#39;3M&#39;) # doctest: +SKIP</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-31&#39;, &#39;2018-04-30&#39;, &#39;2018-07-31&#39;, &#39;2018-10-31&#39;,</span>
<span class="sd"> &#39;2019-01-31&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> `freq` can also be specified as an Offset object.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;1/1/2018&#39;, periods=5, freq=pd.offsets.MonthEnd(3)</span>
<span class="sd"> ... ) # doctest: +SKIP</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-31&#39;, &#39;2018-04-30&#39;, &#39;2018-07-31&#39;, &#39;2018-10-31&#39;,</span>
<span class="sd"> &#39;2019-01-31&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> `inclusive` controls whether to include `start` and `end` that are on the</span>
<span class="sd"> boundary. The default includes boundary points on either end.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2017-01-01&#39;, end=&#39;2017-01-04&#39;, inclusive=&quot;both&quot;</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2017-01-01&#39;, &#39;2017-01-02&#39;, &#39;2017-01-03&#39;, &#39;2017-01-04&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Use ``inclusive=&#39;left&#39;`` to exclude `end` if it falls on the boundary.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2017-01-01&#39;, end=&#39;2017-01-04&#39;, inclusive=&#39;left&#39;</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2017-01-01&#39;, &#39;2017-01-02&#39;, &#39;2017-01-03&#39;], dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Use ``inclusive=&#39;right&#39;`` to exclude `start` if it falls on the boundary.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2017-01-01&#39;, end=&#39;2017-01-04&#39;, inclusive=&#39;right&#39;</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2017-01-02&#39;, &#39;2017-01-03&#39;, &#39;2017-01-04&#39;], dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">freq</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;N&quot;</span><span class="p">,</span> <span class="s2">&quot;ns&quot;</span><span class="p">],</span> <span class="s2">&quot;nanoseconds is not supported&quot;</span>
<span class="k">assert</span> <span class="n">tz</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">,</span> <span class="s2">&quot;Localized DatetimeIndex is not supported&quot;</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span>
<span class="n">DatetimeIndex</span><span class="p">,</span>
<span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span>
<span class="n">pd</span><span class="o">.</span><span class="n">date_range</span><span class="p">(</span>
<span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span>
<span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span>
<span class="n">periods</span><span class="o">=</span><span class="n">periods</span><span class="p">,</span>
<span class="n">freq</span><span class="o">=</span><span class="n">freq</span><span class="p">,</span>
<span class="n">tz</span><span class="o">=</span><span class="n">tz</span><span class="p">,</span>
<span class="n">normalize</span><span class="o">=</span><span class="n">normalize</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span>
<span class="n">inclusive</span><span class="o">=</span><span class="n">inclusive</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">),</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="to_timedelta"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_timedelta.html#pyspark.pandas.to_timedelta">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">to_timedelta</span><span class="p">(</span>
<span class="n">arg</span><span class="p">,</span>
<span class="n">unit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;raise&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert argument to timedelta.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arg : str, timedelta, list-like or Series</span>
<span class="sd"> The data to be converted to timedelta.</span>
<span class="sd"> unit : str, optional</span>
<span class="sd"> Denotes the unit of the arg for numeric `arg`. Defaults to ``&quot;ns&quot;``.</span>
<span class="sd"> Possible values:</span>
<span class="sd"> * &#39;W&#39;</span>
<span class="sd"> * &#39;D&#39; / &#39;days&#39; / &#39;day&#39;</span>
<span class="sd"> * &#39;hours&#39; / &#39;hour&#39; / &#39;hr&#39; / &#39;h&#39;</span>
<span class="sd"> * &#39;m&#39; / &#39;minute&#39; / &#39;min&#39; / &#39;minutes&#39; / &#39;T&#39;</span>
<span class="sd"> * &#39;S&#39; / &#39;seconds&#39; / &#39;sec&#39; / &#39;second&#39;</span>
<span class="sd"> * &#39;ms&#39; / &#39;milliseconds&#39; / &#39;millisecond&#39; / &#39;milli&#39; / &#39;millis&#39; / &#39;L&#39;</span>
<span class="sd"> * &#39;us&#39; / &#39;microseconds&#39; / &#39;microsecond&#39; / &#39;micro&#39; / &#39;micros&#39; / &#39;U&#39;</span>
<span class="sd"> * &#39;ns&#39; / &#39;nanoseconds&#39; / &#39;nano&#39; / &#39;nanos&#39; / &#39;nanosecond&#39; / &#39;N&#39;</span>
<span class="sd"> Must not be specified when `arg` contains strings and ``errors=&quot;raise&quot;``.</span>
<span class="sd"> .. deprecated:: 4.0.0</span>
<span class="sd"> Units &#39;T&#39; and &#39;L&#39; are deprecated and will be removed in a future version.</span>
<span class="sd"> errors : {&#39;ignore&#39;, &#39;raise&#39;, &#39;coerce&#39;}, default &#39;raise&#39;</span>
<span class="sd"> - If &#39;raise&#39;, then invalid parsing will raise an exception.</span>
<span class="sd"> - If &#39;coerce&#39;, then invalid parsing will be set as NaT.</span>
<span class="sd"> - If &#39;ignore&#39;, then invalid parsing will return the input.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : timedelta64, TimedeltaIndex or Series of timedelta64 if parsing succeeded.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.astype : Cast argument to a specified dtype.</span>
<span class="sd"> to_datetime : Convert argument to datetime.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> If the precision is higher than nanoseconds, the precision of the duration is</span>
<span class="sd"> truncated to nanoseconds for string inputs.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Parsing a single string to a Timedelta:</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(&#39;1 days 06:05:01.00003&#39;)</span>
<span class="sd"> Timedelta(&#39;1 days 06:05:01.000030&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(&#39;15.5us&#39;) # doctest: +SKIP</span>
<span class="sd"> Timedelta(&#39;0 days 00:00:00.000015500&#39;)</span>
<span class="sd"> Parsing a list or array of strings:</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta([&#39;1 days 06:05:01.00003&#39;, &#39;15.5us&#39;, &#39;nan&#39;]) # doctest: +SKIP</span>
<span class="sd"> TimedeltaIndex([&#39;1 days 06:05:01.000030&#39;, &#39;0 days 00:00:00.000015500&#39;, NaT],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> Converting numbers by specifying the `unit` keyword argument:</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(np.arange(5), unit=&#39;s&#39;) # doctest: +SKIP</span>
<span class="sd"> TimedeltaIndex([&#39;0 days 00:00:00&#39;, &#39;0 days 00:00:01&#39;, &#39;0 days 00:00:02&#39;,</span>
<span class="sd"> &#39;0 days 00:00:03&#39;, &#39;0 days 00:00:04&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(np.arange(5), unit=&#39;d&#39;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;0 days&#39;, &#39;1 days&#39;, &#39;2 days&#39;, &#39;3 days&#39;, &#39;4 days&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">pandas_to_timedelta</span><span class="p">(</span><span class="n">pser</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">np</span><span class="o">.</span><span class="n">timedelta64</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_timedelta</span><span class="p">(</span>
<span class="n">arg</span><span class="o">=</span><span class="n">pser</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">pandas_to_timedelta</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_timedelta</span><span class="p">(</span>
<span class="n">arg</span><span class="o">=</span><span class="n">arg</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="timedelta_range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.timedelta_range.html#pyspark.pandas.timedelta_range">[docs]</a><span class="k">def</span> <span class="nf">timedelta_range</span><span class="p">(</span>
<span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">end</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">freq</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">closed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">TimedeltaIndex</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a fixed frequency TimedeltaIndex, with day as the default frequency.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : str or timedelta-like, optional</span>
<span class="sd"> Left bound for generating timedeltas.</span>
<span class="sd"> end : str or timedelta-like, optional</span>
<span class="sd"> Right bound for generating timedeltas.</span>
<span class="sd"> periods : int, optional</span>
<span class="sd"> Number of periods to generate.</span>
<span class="sd"> freq : str or DateOffset, default &#39;D&#39;</span>
<span class="sd"> Frequency strings can have multiples, e.g. &#39;5H&#39;.</span>
<span class="sd"> name : str, default None</span>
<span class="sd"> Name of the resulting TimedeltaIndex.</span>
<span class="sd"> closed : {None, &#39;left&#39;, &#39;right&#39;}, optional</span>
<span class="sd"> Make the interval closed with respect to the given frequency to</span>
<span class="sd"> the &#39;left&#39;, &#39;right&#39;, or both sides (None, the default).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> TimedeltaIndex</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,</span>
<span class="sd"> exactly three must be specified. If ``freq`` is omitted, the resulting</span>
<span class="sd"> ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between</span>
<span class="sd"> ``start`` and ``end`` (closed on both sides).</span>
<span class="sd"> To learn more about the frequency strings, please see `this link</span>
<span class="sd"> &lt;https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases&gt;`__.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, periods=4) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;1 days&#39;, &#39;2 days&#39;, &#39;3 days&#39;, &#39;4 days&#39;], dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> The closed parameter specifies which endpoint is included.</span>
<span class="sd"> The default behavior is to include both endpoints.</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, periods=4, closed=&#39;right&#39;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;2 days&#39;, &#39;3 days&#39;, &#39;4 days&#39;], dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> The freq parameter specifies the frequency of the TimedeltaIndex.</span>
<span class="sd"> Only fixed frequencies can be passed, non-fixed frequencies such as ‘M’ (month end) will raise.</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, end=&#39;2 days&#39;, freq=&#39;6H&#39;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;1 days 00:00:00&#39;, &#39;1 days 06:00:00&#39;, &#39;1 days 12:00:00&#39;,</span>
<span class="sd"> &#39;1 days 18:00:00&#39;, &#39;2 days 00:00:00&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify start, end, and periods; the frequency is generated automatically (linearly spaced).</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, end=&#39;5 days&#39;, periods=4)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;1 days 00:00:00&#39;, &#39;2 days 08:00:00&#39;, &#39;3 days 16:00:00&#39;,</span>
<span class="sd"> &#39;5 days 00:00:00&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">freq</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;N&quot;</span><span class="p">,</span> <span class="s2">&quot;ns&quot;</span><span class="p">],</span> <span class="s2">&quot;nanoseconds is not supported&quot;</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span>
<span class="n">TimedeltaIndex</span><span class="p">,</span>
<span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span>
<span class="n">pd</span><span class="o">.</span><span class="n">timedelta_range</span><span class="p">(</span>
<span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span>
<span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span>
<span class="n">periods</span><span class="o">=</span><span class="n">periods</span><span class="p">,</span>
<span class="n">freq</span><span class="o">=</span><span class="n">freq</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span>
<span class="n">closed</span><span class="o">=</span><span class="n">closed</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">),</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="get_dummies"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.get_dummies.html#pyspark.pandas.get_dummies">[docs]</a><span class="k">def</span> <span class="nf">get_dummies</span><span class="p">(</span>
<span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">prefix</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">prefix_sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;_&quot;</span><span class="p">,</span>
<span class="n">dummy_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">sparse</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">drop_first</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert categorical variable into dummy/indicator variables, also</span>
<span class="sd"> known as one hot encoding.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> data : array-like, Series, or DataFrame</span>
<span class="sd"> prefix : string, list of strings, or dict of strings, default None</span>
<span class="sd"> String to use as the prefix of the generated dummy column names.</span>
<span class="sd"> Pass a list with length equal to the number of columns</span>
<span class="sd"> when calling get_dummies on a DataFrame. Alternatively, `prefix`</span>
<span class="sd"> can be a dictionary mapping column names to prefixes.</span>
<span class="sd"> prefix_sep : string, default &#39;_&#39;</span>
<span class="sd"> If appending prefix, separator/delimiter to use. Or pass a</span>
<span class="sd"> list or dictionary as with `prefix`.</span>
<span class="sd"> dummy_na : bool, default False</span>
<span class="sd"> Add a column to indicate NaNs; if False, NaNs are ignored.</span>
<span class="sd"> columns : list-like, default None</span>
<span class="sd"> Column names in the DataFrame to be encoded.</span>
<span class="sd"> If `columns` is None then all the columns with</span>
<span class="sd"> `object` or `category` dtype will be converted.</span>
<span class="sd"> sparse : bool, default False</span>
<span class="sd"> Whether the dummy-encoded columns should be backed by</span>
<span class="sd"> a :class:`SparseArray` (True) or a regular NumPy array (False).</span>
<span class="sd"> In pandas-on-Spark, this value must be False.</span>
<span class="sd"> drop_first : bool, default False</span>
<span class="sd"> Whether to get k-1 dummies out of k categorical levels by removing the</span>
<span class="sd"> first level.</span>
<span class="sd"> dtype : dtype, default np.int8</span>
<span class="sd"> Data type for new columns. Only a single dtype is allowed.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> dummies : DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.str.get_dummies</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series(list(&#39;abca&#39;))</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(s)</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 0 0</span>
<span class="sd"> 1 0 1 0</span>
<span class="sd"> 2 0 0 1</span>
<span class="sd"> 3 1 0 0</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [&#39;a&#39;, &#39;b&#39;, &#39;a&#39;], &#39;B&#39;: [&#39;b&#39;, &#39;a&#39;, &#39;c&#39;],</span>
<span class="sd"> ... &#39;C&#39;: [1, 2, 3]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(df, prefix=[&#39;col1&#39;, &#39;col2&#39;])</span>
<span class="sd"> C col1_a col1_b col2_a col2_b col2_c</span>
<span class="sd"> 0 1 1 0 0 1 0</span>
<span class="sd"> 1 2 0 1 1 0 0</span>
<span class="sd"> 2 3 1 0 0 0 1</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(ps.Series(list(&#39;abcaa&#39;)))</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 0 0</span>
<span class="sd"> 1 0 1 0</span>
<span class="sd"> 2 0 0 1</span>
<span class="sd"> 3 1 0 0</span>
<span class="sd"> 4 1 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(ps.Series(list(&#39;abcaa&#39;)), drop_first=True)</span>
<span class="sd"> b c</span>
<span class="sd"> 0 0 0</span>
<span class="sd"> 1 1 0</span>
<span class="sd"> 2 0 1</span>
<span class="sd"> 3 0 0</span>
<span class="sd"> 4 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(ps.Series(list(&#39;abc&#39;)), dtype=float)</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1.0 0.0 0.0</span>
<span class="sd"> 1 0.0 1.0 0.0</span>
<span class="sd"> 2 0.0 0.0 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">sparse</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">False</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;get_dummies currently does not support sparse&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Input must be a list-like for parameter `columns`&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;byte&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">prefix</span><span class="p">)]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="n">remaining_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;get_dummies currently does not support prefix as string types&quot;</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">_get_dummies_default_accept_types</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)]</span> <span class="o">==</span> <span class="n">columns</span>
<span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span>
<span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="p">:])</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">&gt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span>
<span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span>
<span class="k">else</span> <span class="s2">&quot;&quot;</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span>
<span class="p">]</span>
<span class="k">elif</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">any</span><span class="p">(</span>
<span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Expected tuple, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="nb">type</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">))</span><span class="o">.</span><span class="n">pop</span><span class="p">())</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">columns</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="o">==</span> <span class="n">key</span> <span class="ow">or</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">key</span>
<span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">{}</span><span class="s2"> not in index&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span>
<span class="n">column_labels_set</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span>
<span class="n">remaining_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="p">(</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span>
<span class="k">if</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">==</span> <span class="mi">1</span>
<span class="k">else</span> <span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels_set</span>
<span class="p">]</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span>
<span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">_get_dummies_acceptable_types</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;get_dummies currently only accept </span><span class="si">{}</span><span class="s2"> values&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="s2">&quot;, &quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span>
<span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Type</span><span class="p">[</span><span class="n">DataType</span><span class="p">],</span> <span class="n">t</span><span class="p">)</span><span class="o">.</span><span class="n">typeName</span><span class="p">()</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">_get_dummies_acceptable_types</span><span class="p">]</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">prefix</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Length of &#39;prefix&#39; (</span><span class="si">{}</span><span class="s2">) did not match the length of &quot;</span>
<span class="s2">&quot;the columns being encoded (</span><span class="si">{}</span><span class="s2">).&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">prefix</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="n">prefix</span><span class="p">[</span><span class="n">column_label</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span> <span class="k">for</span> <span class="n">column_label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span>
<span class="n">all_values</span> <span class="o">=</span> <span class="n">_reduce_spark_multi</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">,</span>
<span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">collect_set</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="p">):</span>
<span class="n">values</span> <span class="o">=</span> <span class="n">all_values</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">):</span>
<span class="n">values</span> <span class="o">=</span> <span class="n">values</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
<span class="n">values</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">values</span><span class="p">)</span>
<span class="k">if</span> <span class="n">drop_first</span><span class="p">:</span>
<span class="n">values</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span>
<span class="k">def</span> <span class="nf">column_name</span><span class="p">(</span><span class="n">v</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Name</span><span class="p">:</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">prefix</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;&quot;</span><span class="p">:</span> <span class="c1"># type: ignore[index]</span>
<span class="k">return</span> <span class="n">v</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="s2">&quot;</span><span class="si">{}{}{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">prefix</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">prefix_sep</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span> <span class="c1"># type: ignore[index]</span>
<span class="k">for</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">values</span><span class="p">:</span>
<span class="n">remaining_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">notnull</span><span class="p">()</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="o">==</span> <span class="n">value</span><span class="p">))</span>
<span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span>
<span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">column_name</span><span class="p">(</span><span class="n">value</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">dummy_na</span><span class="p">:</span>
<span class="n">remaining_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">column_name</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">)))</span>
<span class="k">return</span> <span class="n">psdf</span><span class="p">[</span><span class="n">remaining_columns</span><span class="p">]</span></div>
<span class="c1"># TODO: there are many parameters to implement and support. See pandas&#39;s pd.concat.</span>
<div class="viewcode-block" id="concat"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.concat.html#pyspark.pandas.concat">[docs]</a><span class="k">def</span> <span class="nf">concat</span><span class="p">(</span>
<span class="n">objs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]],</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">join</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;outer&quot;</span><span class="p">,</span>
<span class="n">ignore_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">sort</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenate pandas-on-Spark objects along a particular axis with optional set logic</span>
<span class="sd"> along the other axes.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> objs : a sequence of Series or DataFrame</span>
<span class="sd"> Any None objects will be dropped silently unless</span>
<span class="sd"> they are all None, in which case a ValueError will be raised.</span>
<span class="sd"> axis : {0/&#39;index&#39;, 1/&#39;columns&#39;}, default 0</span>
<span class="sd"> The axis to concatenate along.</span>
<span class="sd"> join : {&#39;inner&#39;, &#39;outer&#39;}, default &#39;outer&#39;</span>
<span class="sd"> How to handle indexes on other axis (or axes).</span>
<span class="sd"> ignore_index : bool, default False</span>
<span class="sd"> If True, do not use the index values along the concatenation axis. The</span>
<span class="sd"> resulting axis will be labeled 0, ..., n - 1. This is useful if you are</span>
<span class="sd"> concatenating objects where the concatenation axis does not have</span>
<span class="sd"> meaningful indexing information. Note the index values on the other</span>
<span class="sd"> axes are still respected in the join.</span>
<span class="sd"> sort : bool, default False</span>
<span class="sd"> Sort non-concatenation axis if it is not already aligned.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> object, type of objs</span>
<span class="sd"> When concatenating all ``Series`` along the index (axis=0), a</span>
<span class="sd"> ``Series`` is returned. When ``objs`` contains at least one</span>
<span class="sd"> ``DataFrame``, a ``DataFrame`` is returned. When concatenating along</span>
<span class="sd"> the columns (axis=1), a ``DataFrame`` is returned.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.join : Join DataFrames using indexes.</span>
<span class="sd"> DataFrame.merge : Merge DataFrames by indexes or columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.pandas.config import set_option, reset_option</span>
<span class="sd"> &gt;&gt;&gt; set_option(&quot;compute.ops_on_diff_frames&quot;, True)</span>
<span class="sd"> Combine two ``Series``.</span>
<span class="sd"> &gt;&gt;&gt; s1 = ps.Series([&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; s2 = ps.Series([&#39;c&#39;, &#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([s1, s2])</span>
<span class="sd"> 0 a</span>
<span class="sd"> 1 b</span>
<span class="sd"> 0 c</span>
<span class="sd"> 1 d</span>
<span class="sd"> dtype: object</span>
<span class="sd"> Clear the existing index and reset it in the result</span>
<span class="sd"> by setting the ``ignore_index`` option to ``True``.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([s1, s2], ignore_index=True)</span>
<span class="sd"> 0 a</span>
<span class="sd"> 1 b</span>
<span class="sd"> 2 c</span>
<span class="sd"> 3 d</span>
<span class="sd"> dtype: object</span>
<span class="sd"> Combine two ``DataFrame`` objects with identical columns.</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame([[&#39;a&#39;, 1], [&#39;b&#39;, 2]],</span>
<span class="sd"> ... columns=[&#39;letter&#39;, &#39;number&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df1</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 a 1</span>
<span class="sd"> 1 b 2</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame([[&#39;c&#39;, 3], [&#39;d&#39;, 4]],</span>
<span class="sd"> ... columns=[&#39;letter&#39;, &#39;number&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df2</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 c 3</span>
<span class="sd"> 1 d 4</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df2])</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 a 1</span>
<span class="sd"> 1 b 2</span>
<span class="sd"> 0 c 3</span>
<span class="sd"> 1 d 4</span>
<span class="sd"> Combine ``DataFrame`` and ``Series`` objects with different columns.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df2, s1])</span>
<span class="sd"> letter number 0</span>
<span class="sd"> 0 c 3.0 None</span>
<span class="sd"> 1 d 4.0 None</span>
<span class="sd"> 0 None NaN a</span>
<span class="sd"> 1 None NaN b</span>
<span class="sd"> Combine ``DataFrame`` objects with overlapping columns</span>
<span class="sd"> and return everything. Columns outside the intersection will</span>
<span class="sd"> be filled with ``None`` values.</span>
<span class="sd"> &gt;&gt;&gt; df3 = ps.DataFrame([[&#39;c&#39;, 3, &#39;cat&#39;], [&#39;d&#39;, 4, &#39;dog&#39;]],</span>
<span class="sd"> ... columns=[&#39;letter&#39;, &#39;number&#39;, &#39;animal&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df3</span>
<span class="sd"> letter number animal</span>
<span class="sd"> 0 c 3 cat</span>
<span class="sd"> 1 d 4 dog</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df3])</span>
<span class="sd"> letter number animal</span>
<span class="sd"> 0 a 1 None</span>
<span class="sd"> 1 b 2 None</span>
<span class="sd"> 0 c 3 cat</span>
<span class="sd"> 1 d 4 dog</span>
<span class="sd"> Sort the columns.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df3], sort=True)</span>
<span class="sd"> animal letter number</span>
<span class="sd"> 0 None a 1</span>
<span class="sd"> 1 None b 2</span>
<span class="sd"> 0 cat c 3</span>
<span class="sd"> 1 dog d 4</span>
<span class="sd"> Combine ``DataFrame`` objects with overlapping columns</span>
<span class="sd"> and return only those that are shared by passing ``inner`` to</span>
<span class="sd"> the ``join`` keyword argument.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df3], join=&quot;inner&quot;)</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 a 1</span>
<span class="sd"> 1 b 2</span>
<span class="sd"> 0 c 3</span>
<span class="sd"> 1 d 4</span>
<span class="sd"> &gt;&gt;&gt; df4 = ps.DataFrame([[&#39;bird&#39;, &#39;polly&#39;], [&#39;monkey&#39;, &#39;george&#39;]],</span>
<span class="sd"> ... columns=[&#39;animal&#39;, &#39;name&#39;])</span>
<span class="sd"> Combine with column axis.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df4], axis=1)</span>
<span class="sd"> letter number animal name</span>
<span class="sd"> 0 a 1 bird polly</span>
<span class="sd"> 1 b 2 monkey george</span>
<span class="sd"> &gt;&gt;&gt; reset_option(&quot;compute.ops_on_diff_frames&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">objs</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">))</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span>
<span class="n">objs</span><span class="p">,</span> <span class="n">Iterable</span>
<span class="p">):</span> <span class="c1"># TODO: support dict</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;first argument must be an iterable of pandas-on-Spark &quot;</span>
<span class="s2">&quot;objects, you passed an object of type &quot;</span>
<span class="s1">&#39;&quot;</span><span class="si">{name}</span><span class="s1">&quot;&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Sized</span><span class="p">,</span> <span class="n">objs</span><span class="p">))</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;No objects to concatenate&quot;</span><span class="p">)</span>
<span class="n">objs</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">obj</span><span class="p">:</span> <span class="n">obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="n">objs</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;All objects passed were None&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;cannot concatenate object of type &quot;</span>
<span class="s2">&quot;&#39;</span><span class="si">{name}</span><span class="s2">&quot;</span>
<span class="s2">&quot;; only ps.Series &quot;</span>
<span class="s2">&quot;and ps.DataFrame are valid&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">join</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;inner&quot;</span><span class="p">,</span> <span class="s2">&quot;outer&quot;</span><span class="p">]:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Only can inner (intersect) or outer (union) join the other axis.&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">([</span><span class="n">obj</span><span class="o">.</span><span class="n">empty</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">]):</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The behavior of array concatenation with empty entries is &quot;</span>
<span class="s2">&quot;deprecated. In a future version, this will no longer exclude &quot;</span>
<span class="s2">&quot;empty items when determining the result dtype. &quot;</span>
<span class="s2">&quot;To retain the old behavior, exclude the empty entries before &quot;</span>
<span class="s2">&quot;the concat operation.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">psdfs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="k">else</span> <span class="n">obj</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span>
<span class="p">]</span>
<span class="n">level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">)</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">DataFrame</span><span class="o">.</span><span class="n">_index_normalized_frame</span><span class="p">(</span><span class="n">level</span><span class="p">,</span> <span class="n">psdf</span><span class="p">)</span>
<span class="k">if</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">&gt;</span> <span class="n">level</span>
<span class="k">else</span> <span class="n">psdf</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span>
<span class="p">]</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">psdfs_not_same_anchor</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">1</span><span class="p">:]:</span>
<span class="n">duplicated</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">duplicated</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">pretty_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicated</span><span class="p">]</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Labels have to be unique; however, got duplicated labels </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="n">pretty_names</span>
<span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span>
<span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">concat_psdf</span><span class="p">,</span> <span class="n">psdf</span><span class="p">):</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">concat_psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="p">]</span>
<span class="o">+</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">psdfs_not_same_anchor</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdfs_not_same_anchor</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">resolve_func</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">,</span> <span class="n">that_column_labels</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">AssertionError</span><span class="p">(</span><span class="s2">&quot;This should not happen.&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs_not_same_anchor</span><span class="p">:</span>
<span class="k">if</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;inner&quot;</span><span class="p">:</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span>
<span class="n">resolve_func</span><span class="p">,</span>
<span class="n">concat_psdf</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;inner&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;outer&quot;</span><span class="p">:</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span>
<span class="n">resolve_func</span><span class="p">,</span>
<span class="n">concat_psdf</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;full&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="p">[</span><span class="n">column_labels</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> <span class="c1"># type: ignore[assignment]</span>
<span class="nb">map</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">_range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)))</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span>
<span class="n">columns</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">([</span><span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="n">columns</span><span class="o">.</span><span class="n">nlevels</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="s2">&quot;columns&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">concat_psdf</span>
<span class="c1"># Series, Series ...</span>
<span class="c1"># We should return Series if objects are all Series.</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="nb">all</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">obj</span><span class="p">:</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">),</span> <span class="n">objs</span><span class="p">))</span>
<span class="c1"># DataFrame, Series ... &amp; Series, Series ...</span>
<span class="c1"># In this case, we should return DataFrame.</span>
<span class="n">new_objs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">num_series</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">series_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">num_series</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="n">series_names</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">ignore_index</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">should_return_series</span><span class="p">:</span>
<span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(</span><span class="n">DEFAULT_SERIES_NAME</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span>
<span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="n">column_labels_levels</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels_levels</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;MultiIndex columns should have the same levels&quot;</span><span class="p">)</span>
<span class="c1"># DataFrame, DataFrame, ...</span>
<span class="c1"># All Series are converted into DataFrame and then compute concat.</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">indices_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">index</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="n">index_of_first_psdf</span> <span class="o">=</span> <span class="n">indices_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">for</span> <span class="n">index_of_psdf</span> <span class="ow">in</span> <span class="n">indices_of_psdfs</span><span class="p">:</span>
<span class="k">if</span> <span class="n">index_of_first_psdf</span><span class="o">.</span><span class="n">names</span> <span class="o">!=</span> <span class="n">index_of_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Index type and names should be same in the objects to concatenate. &quot;</span>
<span class="s2">&quot;You passed different indices &quot;</span>
<span class="s2">&quot;</span><span class="si">{index_of_first_psdf}</span><span class="s2"> and </span><span class="si">{index_of_psdf}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">index_of_first_psdf</span><span class="o">=</span><span class="n">index_of_first_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">,</span>
<span class="n">index_of_psdf</span><span class="o">=</span><span class="n">index_of_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">column_labels_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="n">index_names_of_psdfs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Label</span><span class="p">]]]</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">index_names_of_psdfs</span> <span class="o">=</span> <span class="p">[[]</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_names_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="n">name</span> <span class="o">==</span> <span class="n">index_names_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">index_names_of_psdfs</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span>
<span class="n">idx</span> <span class="o">==</span> <span class="n">column_labels_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">idx</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span>
<span class="p">):</span>
<span class="c1"># If all frames share the same index names and the same column labels in the same order, use them as-is.</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="n">new_objs</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;inner&quot;</span><span class="p">:</span>
<span class="n">interested_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="o">*</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="nb">set</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">column_labels_of_psdfs</span><span class="p">))</span>
<span class="c1"># Keep the column order of the first DataFrame.</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">interested_columns</span>
<span class="p">]</span>
<span class="c1"># If sort is True, sort to follow pandas 1.4+ behavior.</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="c1"># FIXME: better ordering</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">name_like_string</span><span class="p">)</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="p">[</span><span class="n">merged_columns</span><span class="p">]</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;outer&quot;</span><span class="p">:</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">labels</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span><span class="p">:</span>
<span class="n">merged_columns</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">merged_columns</span><span class="p">)</span>
<span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span>
<span class="c1"># If sort is True, always sort</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="c1"># FIXME: better ordering</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">name_like_string</span><span class="p">)</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">:</span>
<span class="n">columns_to_add</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">)</span> <span class="o">-</span> <span class="nb">set</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span>
<span class="c1"># TODO: NaN and None difference for missing values. pandas seems to be filling NaN.</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">columns_to_add</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> <span class="o">+</span> <span class="p">[</span>
<span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">columns_to_add</span>
<span class="p">]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="o">+</span> <span class="n">columns_to_add</span><span class="p">),</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> <span class="o">+</span> <span class="p">([</span><span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns_to_add</span><span class="p">))),</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">psdfs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">merged_columns</span><span class="p">])</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">sdfs</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span>
<span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdfs</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span>
<span class="p">]</span>
<span class="n">concatenated</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">y</span><span class="p">),</span> <span class="n">sdfs</span><span class="p">)</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span>
<span class="n">result_psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">concatenated</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">concatenated</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">concatenated</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span>
<span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="c1"># TODO: dtypes?</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span>
<span class="c1"># If all input were Series, we should return Series.</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">series_names</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">series_names</span><span class="o">.</span><span class="n">pop</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">result_psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">result_psdf</span></div>
<div class="viewcode-block" id="melt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.melt.html#pyspark.pandas.melt">[docs]</a><span class="k">def</span> <span class="nf">melt</span><span class="p">(</span>
<span class="n">frame</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">id_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">value_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">var_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">value_name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;value&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">melt</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="n">id_vars</span><span class="p">,</span> <span class="n">value_vars</span><span class="p">,</span> <span class="n">var_name</span><span class="p">,</span> <span class="n">value_name</span><span class="p">)</span></div>
<span class="n">melt</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">melt</span><span class="o">.</span><span class="vm">__doc__</span>
<div class="viewcode-block" id="isna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.isna.html#pyspark.pandas.isna">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">isna</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect missing values for an array-like object.</span>
<span class="sd"> This function takes a scalar or array-like object and indicates</span>
<span class="sd"> whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``</span>
<span class="sd"> in object arrays).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> obj : scalar or array-like</span>
<span class="sd"> Object to check for null or missing values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool or array-like of bool</span>
<span class="sd"> For scalar input, returns a scalar boolean.</span>
<span class="sd"> For array input, returns an array of booleans indicating whether each</span>
<span class="sd"> corresponding element is missing.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.isna : Detect missing values in a Series.</span>
<span class="sd"> Series.isnull : Detect missing values in a Series.</span>
<span class="sd"> DataFrame.isna : Detect missing values in a DataFrame.</span>
<span class="sd"> DataFrame.isnull : Detect missing values in a DataFrame.</span>
<span class="sd"> Index.isna : Detect missing values in an Index.</span>
<span class="sd"> Index.isnull : Detect missing values in an Index.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Scalar arguments (including strings) result in a scalar boolean.</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(&#39;dog&#39;)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(np.nan)</span>
<span class="sd"> True</span>
<span class="sd"> ndarrays result in an ndarray of booleans.</span>
<span class="sd"> &gt;&gt;&gt; array = np.array([[1, np.nan, 3], [4, 5, np.nan]])</span>
<span class="sd"> &gt;&gt;&gt; array</span>
<span class="sd"> array([[ 1., nan, 3.],</span>
<span class="sd"> [ 4., 5., nan]])</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(array)</span>
<span class="sd"> array([[False, True, False],</span>
<span class="sd"> [False, False, True]])</span>
<span class="sd"> For Series and DataFrame, the same type is returned, containing booleans.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [&#39;ant&#39;, &#39;bee&#39;, &#39;cat&#39;], &#39;b&#39;: [&#39;dog&#39;, None, &#39;fly&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 ant dog</span>
<span class="sd"> 1 bee None</span>
<span class="sd"> 2 cat fly</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(df)</span>
<span class="sd"> a b</span>
<span class="sd"> 0 False False</span>
<span class="sd"> 1 False True</span>
<span class="sd"> 2 False False</span>
<span class="sd"> &gt;&gt;&gt; ps.isnull(df.b)</span>
<span class="sd"> 0 False</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> Name: b, dtype: bool</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># TODO: Add back:</span>
<span class="c1"># notnull : Boolean inverse of pandas.isnull.</span>
<span class="c1"># into the See Also in the docstring. It does not find the method in the latest numpydoc.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span></div>
<span class="n">isnull</span> <span class="o">=</span> <span class="n">isna</span>
<div class="viewcode-block" id="notna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.notna.html#pyspark.pandas.notna">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">notna</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect existing (non-missing) values.</span>
<span class="sd"> Return a boolean same-sized object indicating if the values are not NA.</span>
<span class="sd"> Non-missing values get mapped to True. NA values, such as None or</span>
<span class="sd"> :attr:`numpy.nan`, get mapped to False values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool or array-like of bool</span>
<span class="sd"> Mask of bool values for each element that</span>
<span class="sd"> indicates whether an element is not an NA value.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> isna : Detect missing values for an array-like object.</span>
<span class="sd"> Series.notna : Boolean inverse of Series.isna.</span>
<span class="sd"> DataFrame.notnull : Boolean inverse of DataFrame.isnull.</span>
<span class="sd"> Index.notna : Boolean inverse of Index.isna.</span>
<span class="sd"> Index.notnull : Boolean inverse of Index.isnull.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Show which entries in a DataFrame are not NA.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;age&#39;: [5, 6, np.nan],</span>
<span class="sd"> ... &#39;born&#39;: [pd.NaT, pd.Timestamp(&#39;1939-05-27&#39;),</span>
<span class="sd"> ... pd.Timestamp(&#39;1940-04-25&#39;)],</span>
<span class="sd"> ... &#39;name&#39;: [&#39;Alfred&#39;, &#39;Batman&#39;, &#39;&#39;],</span>
<span class="sd"> ... &#39;toy&#39;: [None, &#39;Batmobile&#39;, &#39;Joker&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> age born name toy</span>
<span class="sd"> 0 5.0 NaT Alfred None</span>
<span class="sd"> 1 6.0 1939-05-27 Batman Batmobile</span>
<span class="sd"> 2 NaN 1940-04-25 Joker</span>
<span class="sd"> &gt;&gt;&gt; df.notnull()</span>
<span class="sd"> age born name toy</span>
<span class="sd"> 0 True False True False</span>
<span class="sd"> 1 True True True True</span>
<span class="sd"> 2 False True True True</span>
<span class="sd"> Show which entries in a Series are not NA.</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 6, np.nan])</span>
<span class="sd"> &gt;&gt;&gt; ser</span>
<span class="sd"> 0 5.0</span>
<span class="sd"> 1 6.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; ps.notna(ser)</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; ps.notna(ser.index)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># TODO: Add back:</span>
<span class="c1"># Series.notnull : Boolean inverse of Series.isnull.</span>
<span class="c1"># DataFrame.notna : Boolean inverse of DataFrame.isna.</span>
<span class="c1"># into the See Also in the docstring. It does not find the method in the latest numpydoc.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">notna</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">notna</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span></div>
<span class="n">notnull</span> <span class="o">=</span> <span class="n">notna</span>
<div class="viewcode-block" id="merge"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.merge.html#pyspark.pandas.merge">[docs]</a><span class="k">def</span> <span class="nf">merge</span><span class="p">(</span>
<span class="n">obj</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">right</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;inner&quot;</span><span class="p">,</span>
<span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">right_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">right_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">suffixes</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">&quot;_x&quot;</span><span class="p">,</span> <span class="s2">&quot;_y&quot;</span><span class="p">),</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Merge DataFrame objects with a database-style join.</span>
<span class="sd"> The index of the resulting DataFrame will be one of the following:</span>
<span class="sd"> - 0...n if no index is used for merging</span>
<span class="sd"> - Index of the left DataFrame if merged only on the index of the right DataFrame</span>
<span class="sd"> - Index of the right DataFrame if merged only on the index of the left DataFrame</span>
<span class="sd"> - All involved indices if merged using the indices of both DataFrames</span>
<span class="sd"> e.g. if `left` has indices (a, x) and `right` has indices (b, x), the result will</span>
<span class="sd"> be an index (x, a, b)</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> right: Object to merge with.</span>
<span class="sd"> how: Type of merge to be performed.</span>
<span class="sd"> {&#39;left&#39;, &#39;right&#39;, &#39;outer&#39;, &#39;inner&#39;}, default &#39;inner&#39;</span>
<span class="sd"> left: use only keys from left frame, like a SQL left outer join; preserve key</span>
<span class="sd"> order.</span>
<span class="sd"> right: use only keys from right frame, like a SQL right outer join; preserve key</span>
<span class="sd"> order.</span>
<span class="sd"> outer: use union of keys from both frames, like a SQL full outer join; sort keys</span>
<span class="sd"> lexicographically.</span>
<span class="sd"> inner: use intersection of keys from both frames, like a SQL inner join;</span>
<span class="sd"> preserve the order of the left keys.</span>
<span class="sd"> on: Column or index level names to join on. These must be found in both DataFrames. If on</span>
<span class="sd"> is None and not merging on indexes then this defaults to the intersection of the</span>
<span class="sd"> columns in both DataFrames.</span>
<span class="sd"> left_on: Column or index level names to join on in the left DataFrame. Can also</span>
<span class="sd"> be an array or list of arrays of the length of the left DataFrame.</span>
<span class="sd"> These arrays are treated as if they are columns.</span>
<span class="sd"> right_on: Column or index level names to join on in the right DataFrame. Can also</span>
<span class="sd"> be an array or list of arrays of the length of the right DataFrame.</span>
<span class="sd"> These arrays are treated as if they are columns.</span>
<span class="sd"> left_index: Use the index from the left DataFrame as the join key(s). If it is a</span>
<span class="sd"> MultiIndex, the number of keys in the other DataFrame (either the index or a number of</span>
<span class="sd"> columns) must match the number of levels.</span>
<span class="sd"> right_index: Use the index from the right DataFrame as the join key. Same caveats as</span>
<span class="sd"> left_index.</span>
<span class="sd"> suffixes: Suffix to apply to overlapping column names in the left and right side,</span>
<span class="sd"> respectively.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> A DataFrame of the two merged objects.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame({&#39;lkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [1, 2, 3, 5]},</span>
<span class="sd"> ... columns=[&#39;lkey&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame({&#39;rkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [5, 6, 7, 8]},</span>
<span class="sd"> ... columns=[&#39;rkey&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df1</span>
<span class="sd"> lkey value</span>
<span class="sd"> 0 foo 1</span>
<span class="sd"> 1 bar 2</span>
<span class="sd"> 2 baz 3</span>
<span class="sd"> 3 foo 5</span>
<span class="sd"> &gt;&gt;&gt; df2</span>
<span class="sd"> rkey value</span>
<span class="sd"> 0 foo 5</span>
<span class="sd"> 1 bar 6</span>
<span class="sd"> 2 baz 7</span>
<span class="sd"> 3 foo 8</span>
<span class="sd"> Merge df1 and df2 on the lkey and rkey columns. The value columns have</span>
<span class="sd"> the default suffixes, _x and _y, appended.</span>
<span class="sd"> &gt;&gt;&gt; merged = ps.merge(df1, df2, left_on=&#39;lkey&#39;, right_on=&#39;rkey&#39;)</span>
<span class="sd"> &gt;&gt;&gt; merged.sort_values(by=[&#39;lkey&#39;, &#39;value_x&#39;, &#39;rkey&#39;, &#39;value_y&#39;]) # doctest: +ELLIPSIS</span>
<span class="sd"> lkey value_x rkey value_y</span>
<span class="sd"> ...bar 2 bar 6</span>
<span class="sd"> ...baz 3 baz 7</span>
<span class="sd"> ...foo 1 foo 5</span>
<span class="sd"> ...foo 1 foo 8</span>
<span class="sd"> ...foo 5 foo 5</span>
<span class="sd"> ...foo 5 foo 8</span>
<span class="sd"> &gt;&gt;&gt; left_psdf = ps.DataFrame({&#39;A&#39;: [1, 2]})</span>
<span class="sd"> &gt;&gt;&gt; right_psdf = ps.DataFrame({&#39;B&#39;: [&#39;x&#39;, &#39;y&#39;]}, index=[1, 2])</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 1 2 x</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how=&#39;left&#39;).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 None</span>
<span class="sd"> 1 2 x</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how=&#39;right&#39;).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 1 2.0 x</span>
<span class="sd"> 2 NaN y</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how=&#39;outer&#39;).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1.0 None</span>
<span class="sd"> 1 2.0 x</span>
<span class="sd"> 2 NaN y</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> As described in Koalas issue #263, joining string columns currently returns None for missing values</span>
<span class="sd"> instead of NaN.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span>
<span class="n">right</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="n">how</span><span class="p">,</span>
<span class="n">on</span><span class="o">=</span><span class="n">on</span><span class="p">,</span>
<span class="n">left_on</span><span class="o">=</span><span class="n">left_on</span><span class="p">,</span>
<span class="n">right_on</span><span class="o">=</span><span class="n">right_on</span><span class="p">,</span>
<span class="n">left_index</span><span class="o">=</span><span class="n">left_index</span><span class="p">,</span>
<span class="n">right_index</span><span class="o">=</span><span class="n">right_index</span><span class="p">,</span>
<span class="n">suffixes</span><span class="o">=</span><span class="n">suffixes</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="merge_asof"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.merge_asof.html#pyspark.pandas.merge_asof">[docs]</a><span class="k">def</span> <span class="nf">merge_asof</span><span class="p">(</span>
<span class="n">left</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">right</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">right_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">right_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">right_by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">suffixes</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">&quot;_x&quot;</span><span class="p">,</span> <span class="s2">&quot;_y&quot;</span><span class="p">),</span>
<span class="n">tolerance</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">allow_exact_matches</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">direction</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;backward&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Perform an asof merge.</span>
<span class="sd"> This is like a left-join except that we match on nearest</span>
<span class="sd"> key rather than equal keys.</span>
<span class="sd"> For each row in the left DataFrame:</span>
<span class="sd"> - A &quot;backward&quot; search selects the last row in the right DataFrame whose</span>
<span class="sd"> &#39;on&#39; key is less than or equal to the left&#39;s key.</span>
<span class="sd"> - A &quot;forward&quot; search selects the first row in the right DataFrame whose</span>
<span class="sd"> &#39;on&#39; key is greater than or equal to the left&#39;s key.</span>
<span class="sd"> - A &quot;nearest&quot; search selects the row in the right DataFrame whose &#39;on&#39;</span>
<span class="sd"> key is closest in absolute distance to the left&#39;s key.</span>
<span class="sd"> Optionally match on equivalent keys with &#39;by&#39; before searching with &#39;on&#39;.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : DataFrame or named Series</span>
<span class="sd"> right : DataFrame or named Series</span>
<span class="sd"> on : label</span>
<span class="sd"> Field name to join on. Must be found in both DataFrames.</span>
<span class="sd"> The data MUST be ordered. This must be a numeric column,</span>
<span class="sd"> such as datetimelike, integer, or float. On or left_on/right_on</span>
<span class="sd"> must be given.</span>
<span class="sd"> left_on : label</span>
<span class="sd"> Field name to join on in left DataFrame.</span>
<span class="sd"> right_on : label</span>
<span class="sd"> Field name to join on in right DataFrame.</span>
<span class="sd"> left_index : bool</span>
<span class="sd"> Use the index of the left DataFrame as the join key.</span>
<span class="sd"> right_index : bool</span>
<span class="sd"> Use the index of the right DataFrame as the join key.</span>
<span class="sd"> by : column name or list of column names</span>
<span class="sd"> Match on these columns before performing merge operation.</span>
<span class="sd"> left_by : column name</span>
<span class="sd"> Field names to match on in the left DataFrame.</span>
<span class="sd"> right_by : column name</span>
<span class="sd"> Field names to match on in the right DataFrame.</span>
<span class="sd"> suffixes : 2-length sequence (tuple, list, ...)</span>
<span class="sd"> Suffix to apply to overlapping column names in the left and right</span>
<span class="sd"> side, respectively.</span>
<span class="sd"> tolerance : int or Timedelta, optional, default None</span>
<span class="sd"> Select asof tolerance within this range; must be compatible</span>
<span class="sd"> with the merge index.</span>
<span class="sd"> allow_exact_matches : bool, default True</span>
<span class="sd"> - If True, allow matching with the same &#39;on&#39; value</span>
<span class="sd"> (i.e. less-than-or-equal-to / greater-than-or-equal-to)</span>
<span class="sd"> - If False, don&#39;t match the same &#39;on&#39; value</span>
<span class="sd"> (i.e., strictly less-than / strictly greater-than).</span>
<span class="sd"> direction : &#39;backward&#39; (default), &#39;forward&#39;, or &#39;nearest&#39;</span>
<span class="sd"> Whether to search for prior, subsequent, or closest matches.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> merged : DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> merge : Merge with a database-style join.</span>
<span class="sd"> merge_ordered : Merge with optional filling/interpolation.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; left = ps.DataFrame({&quot;a&quot;: [1, 5, 10], &quot;left_val&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; left</span>
<span class="sd"> a left_val</span>
<span class="sd"> 0 1 a</span>
<span class="sd"> 1 5 b</span>
<span class="sd"> 2 10 c</span>
<span class="sd"> &gt;&gt;&gt; right = ps.DataFrame({&quot;a&quot;: [1, 2, 3, 6, 7], &quot;right_val&quot;: [1, 2, 3, 6, 7]})</span>
<span class="sd"> &gt;&gt;&gt; right</span>
<span class="sd"> a right_val</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 2 2</span>
<span class="sd"> 2 3 3</span>
<span class="sd"> 3 6 6</span>
<span class="sd"> 4 7 7</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(left, right, on=&quot;a&quot;).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a 1</span>
<span class="sd"> 1 5 b 3</span>
<span class="sd"> 2 10 c 7</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... left,</span>
<span class="sd"> ... right,</span>
<span class="sd"> ... on=&quot;a&quot;,</span>
<span class="sd"> ... allow_exact_matches=False</span>
<span class="sd"> ... ).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a NaN</span>
<span class="sd"> 1 5 b 3.0</span>
<span class="sd"> 2 10 c 7.0</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... left,</span>
<span class="sd"> ... right,</span>
<span class="sd"> ... on=&quot;a&quot;,</span>
<span class="sd"> ... direction=&quot;forward&quot;</span>
<span class="sd"> ... ).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a 1.0</span>
<span class="sd"> 1 5 b 6.0</span>
<span class="sd"> 2 10 c NaN</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... left,</span>
<span class="sd"> ... right,</span>
<span class="sd"> ... on=&quot;a&quot;,</span>
<span class="sd"> ... direction=&quot;nearest&quot;</span>
<span class="sd"> ... ).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a 1</span>
<span class="sd"> 1 5 b 6</span>
<span class="sd"> 2 10 c 7</span>
<span class="sd"> We can use indexed DataFrames as well.</span>
<span class="sd"> &gt;&gt;&gt; left = ps.DataFrame({&quot;left_val&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]}, index=[1, 5, 10])</span>
<span class="sd"> &gt;&gt;&gt; left</span>
<span class="sd"> left_val</span>
<span class="sd"> 1 a</span>
<span class="sd"> 5 b</span>
<span class="sd"> 10 c</span>
<span class="sd"> &gt;&gt;&gt; right = ps.DataFrame({&quot;right_val&quot;: [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])</span>
<span class="sd"> &gt;&gt;&gt; right</span>
<span class="sd"> right_val</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> 6 6</span>
<span class="sd"> 7 7</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(left, right, left_index=True, right_index=True).sort_index()</span>
<span class="sd"> left_val right_val</span>
<span class="sd"> 1 a 1</span>
<span class="sd"> 5 b 3</span>
<span class="sd"> 10 c 7</span>
<span class="sd"> Here is a real-world time-series example</span>
<span class="sd"> &gt;&gt;&gt; quotes = ps.DataFrame(</span>
<span class="sd"> ... {</span>
<span class="sd"> ... &quot;time&quot;: [</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.023&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.023&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.030&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.041&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.049&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.072&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.075&quot;)</span>
<span class="sd"> ... ],</span>
<span class="sd"> ... &quot;ticker&quot;: [</span>
<span class="sd"> ... &quot;GOOG&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;,</span>
<span class="sd"> ... &quot;GOOG&quot;,</span>
<span class="sd"> ... &quot;AAPL&quot;,</span>
<span class="sd"> ... &quot;GOOG&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;</span>
<span class="sd"> ... ],</span>
<span class="sd"> ... &quot;bid&quot;: [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],</span>
<span class="sd"> ... &quot;ask&quot;: [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]</span>
<span class="sd"> ... }</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; quotes</span>
<span class="sd"> time ticker bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93</span>
<span class="sd"> 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96</span>
<span class="sd"> 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98</span>
<span class="sd"> 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93</span>
<span class="sd"> 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01</span>
<span class="sd"> 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88</span>
<span class="sd"> 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03</span>
<span class="sd"> &gt;&gt;&gt; trades = ps.DataFrame(</span>
<span class="sd"> ... {</span>
<span class="sd"> ... &quot;time&quot;: [</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.023&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.038&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;)</span>
<span class="sd"> ... ],</span>
<span class="sd"> ... &quot;ticker&quot;: [&quot;MSFT&quot;, &quot;MSFT&quot;, &quot;GOOG&quot;, &quot;GOOG&quot;, &quot;AAPL&quot;],</span>
<span class="sd"> ... &quot;price&quot;: [51.95, 51.95, 720.77, 720.92, 98.0],</span>
<span class="sd"> ... &quot;quantity&quot;: [75, 155, 100, 100, 100]</span>
<span class="sd"> ... }</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; trades</span>
<span class="sd"> time ticker price quantity</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 GOOG 720.77 100</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.92 100</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 AAPL 98.00 100</span>
<span class="sd"> By default we are taking the asof of the quotes</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... trades, quotes, on=&quot;time&quot;, by=&quot;ticker&quot;</span>
<span class="sd"> ... ).sort_values([&quot;time&quot;, &quot;ticker&quot;, &quot;price&quot;]).reset_index(drop=True)</span>
<span class="sd"> time ticker price quantity bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93</span>
<span class="sd"> We only asof within 2ms between the quote time and the trade time</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... trades,</span>
<span class="sd"> ... quotes,</span>
<span class="sd"> ... on=&quot;time&quot;,</span>
<span class="sd"> ... by=&quot;ticker&quot;,</span>
<span class="sd"> ... tolerance=sf.expr(&quot;INTERVAL 2 MILLISECONDS&quot;) # pd.Timedelta(&quot;2ms&quot;)</span>
<span class="sd"> ... ).sort_values([&quot;time&quot;, &quot;ticker&quot;, &quot;price&quot;]).reset_index(drop=True)</span>
<span class="sd"> time ticker price quantity bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93</span>
<span class="sd"> We only asof within 10ms between the quote time and the trade time</span>
<span class="sd"> and we exclude exact matches on time. However, *prior* data will</span>
<span class="sd"> propagate forward</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... trades,</span>
<span class="sd"> ... quotes,</span>
<span class="sd"> ... on=&quot;time&quot;,</span>
<span class="sd"> ... by=&quot;ticker&quot;,</span>
<span class="sd"> ... tolerance=sf.expr(&quot;INTERVAL 10 MILLISECONDS&quot;), # pd.Timedelta(&quot;10ms&quot;)</span>
<span class="sd"> ... allow_exact_matches=False</span>
<span class="sd"> ... ).sort_values([&quot;time&quot;, &quot;ticker&quot;, &quot;price&quot;]).reset_index(drop=True)</span>
<span class="sd"> time ticker price quantity bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">to_list</span><span class="p">(</span><span class="n">os</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]])</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span>
<span class="k">if</span> <span class="n">os</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="p">[]</span>
<span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">os</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">os</span><span class="p">)]</span>
<span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">os</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[(</span><span class="n">os</span><span class="p">,)]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="p">[</span><span class="n">o</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">o</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">o</span><span class="p">,)</span> <span class="k">for</span> <span class="n">o</span> <span class="ow">in</span> <span class="n">os</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">left</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">right</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">if</span> <span class="n">on</span><span class="p">:</span>
<span class="k">if</span> <span class="n">left_on</span> <span class="ow">or</span> <span class="n">right_on</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s1">&#39;Can only pass argument &quot;on&quot; OR &quot;left_on&quot; and &quot;right_on&quot;, &#39;</span>
<span class="s2">&quot;not a combination of both.&quot;</span>
<span class="p">)</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">left_index</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;left can only have one index&quot;</span><span class="p">)</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">left_on</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">right_index</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;right can only have one index&quot;</span><span class="p">)</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">right_on</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">left_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_as_of_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Must pass right_on or right_index=True&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">right_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">left_as_of_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Must pass left_on or left_index=True&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">left_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_as_of_names</span><span class="p">:</span>
<span class="n">common</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">common</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;No common columns to perform merge on. Merge options: &quot;</span>
<span class="s2">&quot;left_on=None, right_on=None, left_index=False, right_index=False&quot;</span>
<span class="p">)</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">left_as_of_names</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;can only asof on a key for left&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">right_as_of_names</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;can only asof on a key for right&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">by</span><span class="p">:</span>
<span class="k">if</span> <span class="n">left_by</span> <span class="ow">or</span> <span class="n">right_by</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">&#39;Can only pass argument &quot;by&quot; OR &quot;left_by&quot; and &quot;right_by&quot;.&#39;</span><span class="p">)</span>
<span class="n">left_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">by</span><span class="p">)))</span>
<span class="n">right_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">by</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">left_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">left_by</span><span class="p">)))</span>
<span class="n">right_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">right_by</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">left_join_on_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_join_on_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;missing right_by&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">right_join_on_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">left_join_on_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;missing left_by&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">left_join_on_names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">right_join_on_names</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;left_by and right_by must be same length&quot;</span><span class="p">)</span>
<span class="c1"># We should distinguish the names to avoid ambiguous column names after merging.</span>
<span class="n">right_prefix</span> <span class="o">=</span> <span class="s2">&quot;__right_&quot;</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">right_as_of_name</span> <span class="k">for</span> <span class="n">right_as_of_name</span> <span class="ow">in</span> <span class="n">right_as_of_names</span><span class="p">]</span>
<span class="n">right_join_on_names</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">right_prefix</span> <span class="o">+</span> <span class="n">right_join_on_name</span> <span class="k">for</span> <span class="n">right_join_on_name</span> <span class="ow">in</span> <span class="n">right_join_on_names</span>
<span class="p">]</span>
<span class="n">left_as_of_name</span> <span class="o">=</span> <span class="n">left_as_of_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">right_as_of_name</span> <span class="o">=</span> <span class="n">right_as_of_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">resolve</span><span class="p">(</span><span class="n">internal</span><span class="p">:</span> <span class="n">InternalFrame</span><span class="p">,</span> <span class="n">side</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">rename</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="k">return</span> <span class="s2">&quot;__</span><span class="si">{}</span><span class="s2">_</span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">side</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span>
<span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">HIDDEN_COLUMNS</span>
<span class="p">],</span>
<span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span>
<span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">],</span>
<span class="p">)</span>
<span class="n">left_internal</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="n">right_internal</span> <span class="o">=</span> <span class="n">resolve</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="p">,</span> <span class="s2">&quot;right&quot;</span><span class="p">)</span>
<span class="n">left_table</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;left_table&quot;</span><span class="p">)</span>
<span class="n">right_table</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;right_table&quot;</span><span class="p">)</span>
<span class="n">left_as_of_column</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">left_table</span><span class="p">,</span> <span class="n">left_as_of_name</span><span class="p">)</span>
<span class="n">right_as_of_column</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">right_as_of_name</span><span class="p">)</span>
<span class="k">if</span> <span class="n">left_join_on_names</span><span class="p">:</span>
<span class="n">left_join_on_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">left_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_join_on_names</span><span class="p">]</span>
<span class="n">right_join_on_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_join_on_names</span><span class="p">]</span>
<span class="n">on</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">lft</span><span class="p">,</span> <span class="n">rgt</span><span class="p">:</span> <span class="n">lft</span> <span class="o">&amp;</span> <span class="n">rgt</span><span class="p">,</span>
<span class="p">[</span><span class="n">lft</span> <span class="o">==</span> <span class="n">rgt</span> <span class="k">for</span> <span class="n">lft</span><span class="p">,</span> <span class="n">rgt</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">left_join_on_columns</span><span class="p">,</span> <span class="n">right_join_on_columns</span><span class="p">)],</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">on</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">Column</span> <span class="o">=</span> <span class="n">get_column_class</span><span class="p">()</span>
<span class="k">if</span> <span class="n">tolerance</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tolerance</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">tolerance</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">tolerance</span><span class="p">)</span>
<span class="n">as_of_joined_table</span> <span class="o">=</span> <span class="n">left_table</span><span class="o">.</span><span class="n">_joinAsOf</span><span class="p">(</span>
<span class="n">right_table</span><span class="p">,</span>
<span class="n">leftAsOfColumn</span><span class="o">=</span><span class="n">left_as_of_column</span><span class="p">,</span>
<span class="n">rightAsOfColumn</span><span class="o">=</span><span class="n">right_as_of_column</span><span class="p">,</span>
<span class="n">on</span><span class="o">=</span><span class="n">on</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;left&quot;</span><span class="p">,</span>
<span class="n">tolerance</span><span class="o">=</span><span class="n">tolerance</span><span class="p">,</span>
<span class="n">allowExactMatches</span><span class="o">=</span><span class="n">allow_exact_matches</span><span class="p">,</span>
<span class="n">direction</span><span class="o">=</span><span class="n">direction</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># Unpack suffixes tuple for convenience</span>
<span class="n">left_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">right_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="c1"># Append suffixes to columns with the same name to avoid conflicts later</span>
<span class="n">duplicate_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&amp;</span> <span class="nb">set</span><span class="p">(</span><span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span>
<span class="n">exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">def</span> <span class="nf">left_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span> <span class="c1"># type: ignore[valid-type]</span>
<span class="k">return</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">as_of_joined_table</span><span class="p">,</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span> <span class="c1"># type: ignore[valid-type]</span>
<span class="k">return</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">as_of_joined_table</span><span class="p">,</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">left_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span>
<span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="p">(</span><span class="n">left_as_of_names</span> <span class="o">+</span> <span class="n">left_join_on_names</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span>
<span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="p">(</span><span class="n">right_as_of_names</span> <span class="o">+</span> <span class="n">right_join_on_names</span><span class="p">)</span>
<span class="p">):</span>
<span class="k">pass</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">left_suffix</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">left_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span>
<span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span>
<span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="c1"># Strip the leading `right_prefix` from the Spark column name.</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)[</span><span class="nb">len</span><span class="p">(</span><span class="n">right_prefix</span><span class="p">)</span> <span class="p">:]</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span>
<span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="n">left_as_of_names</span> <span class="o">+</span> <span class="n">left_join_on_names</span> <span class="ow">and</span> <span class="p">(</span>
<span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="n">right_as_of_names</span> <span class="o">+</span> <span class="n">right_join_on_names</span>
<span class="p">):</span>
<span class="k">continue</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">right_suffix</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">right_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span>
<span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span>
<span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="c1"># Retain indices if they are used for joining</span>
<span class="k">if</span> <span class="n">left_index</span> <span class="ow">or</span> <span class="n">right_index</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">))</span>
<span class="p">]</span>
<span class="n">left_index_scols</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_spark_column_names</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">exprs</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">left_index_scols</span><span class="p">)</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">selected_columns</span> <span class="o">=</span> <span class="n">as_of_joined_table</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">exprs</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">selected_columns</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_numeric"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_numeric.html#pyspark.pandas.to_numeric">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">to_numeric</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s2">&quot;raise&quot;</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert argument to a numeric type.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arg : scalar, list, tuple, 1-d array, or Series</span>
<span class="sd"> Argument to be converted.</span>
<span class="sd"> errors : {&#39;raise&#39;, &#39;coerce&#39;, &#39;ignore&#39;}, default &#39;raise&#39;</span>
<span class="sd"> * If &#39;coerce&#39;, then invalid parsing will be set as NaN.</span>
<span class="sd"> * If &#39;raise&#39;, then invalid parsing will raise an exception.</span>
<span class="sd"> * If &#39;ignore&#39;, then invalid parsing will return the input.</span>
<span class="sd"> .. note:: &#39;ignore&#39; is not implemented yet when `arg` is a pandas-on-Spark Series.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : numeric if parsing succeeded.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.astype : Cast argument to a specified dtype.</span>
<span class="sd"> to_datetime : Convert argument to datetime.</span>
<span class="sd"> to_timedelta : Convert argument to timedelta.</span>
<span class="sd"> numpy.ndarray.astype : Cast a numpy array to a specified type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 -3</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(psser)</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 -3.0</span>
<span class="sd"> dtype: float32</span>
<span class="sd"> If the given Series contains a value that cannot be cast to float, it becomes `np.nan`</span>
<span class="sd"> when `errors` is set to &quot;coerce&quot;.</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([&#39;apple&#39;, &#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 apple</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 -3</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(psser, errors=&quot;coerce&quot;)</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 2.0</span>
<span class="sd"> 3 -3.0</span>
<span class="sd"> dtype: float32</span>
<span class="sd"> A list, tuple, np.array, or scalar is also supported.</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric([&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;])</span>
<span class="sd"> array([ 1., 2., -3.])</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric((&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;))</span>
<span class="sd"> array([ 1., 2., -3.])</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(np.array([&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;]))</span>
<span class="sd"> array([ 1., 2., -3.])</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(&#39;1.0&#39;)</span>
<span class="sd"> 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">&quot;coerce&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;float&quot;</span><span class="p">))</span>
<span class="k">elif</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">&quot;raise&quot;</span><span class="p">:</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="n">scol_casted</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;float&quot;</span><span class="p">)</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">assert_true</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span> <span class="o">|</span> <span class="n">scol_casted</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">())</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">scol_casted</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">&quot;ignore&quot;</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;&#39;ignore&#39; is not implemented yet, when the `arg` is Series.&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;invalid error value specified&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_numeric</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">)</span></div>
<div class="viewcode-block" id="broadcast"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.broadcast.html#pyspark.pandas.broadcast">[docs]</a><span class="k">def</span> <span class="nf">broadcast</span><span class="p">(</span><span class="n">obj</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Marks a DataFrame as small enough for use in broadcast joins.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`DataFrame.spark.hint` instead.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> obj : DataFrame</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : DataFrame with broadcast hint.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.merge : Merge DataFrame objects with a database-style join.</span>
<span class="sd"> DataFrame.join : Join columns of another DataFrame.</span>
<span class="sd"> DataFrame.update : Modify in place using non-NA values from another DataFrame.</span>
<span class="sd"> DataFrame.spark.hint : Specifies some hint on the current DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame({&#39;lkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [1, 2, 3, 5]},</span>
<span class="sd"> ... columns=[&#39;lkey&#39;, &#39;value&#39;]).set_index(&#39;lkey&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame({&#39;rkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [5, 6, 7, 8]},</span>
<span class="sd"> ... columns=[&#39;rkey&#39;, &#39;value&#39;]).set_index(&#39;rkey&#39;)</span>
<span class="sd"> &gt;&gt;&gt; merged = df1.merge(ps.broadcast(df2), left_index=True, right_index=True)</span>
<span class="sd"> &gt;&gt;&gt; merged.spark.explain() # doctest: +ELLIPSIS</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> ...BroadcastHashJoin...</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;`broadcast` has been deprecated and might be removed in a future version. &quot;</span>
<span class="s2">&quot;Use `DataFrame.spark.hint` with &#39;broadcast&#39; for `name` parameter instead.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Invalid type : expected DataFrame got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">broadcast</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">))</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="read_orc"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_orc.html#pyspark.pandas.read_orc">[docs]</a><span class="k">def</span> <span class="nf">read_orc</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Load an ORC object from the file path, returning a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : str</span>
<span class="sd"> The path string storing the ORC file to be read.</span>
<span class="sd"> columns : list, default None</span>
<span class="sd"> If not None, only these columns will be read from the file.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_orc(&#39;%s/read_spark_io/data.orc&#39; % path)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_orc(&#39;%s/read_spark_io/data.orc&#39; % path, columns=[&#39;id&#39;])</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_orc(&#39;%s/read_spark_io/data.orc&#39; % path, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_orc(&#39;%s/read_spark_io/data.orc&#39; % path, columns=[&#39;id&#39;], index_col=&quot;index&quot;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;orc&quot;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">psdf_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span>
<span class="n">new_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">()</span>
<span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span>
<span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">psdf_columns</span><span class="p">:</span>
<span class="n">new_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Unknown column name &#39;</span><span class="si">{}</span><span class="s2">&#39;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">column</span><span class="p">))</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">new_columns</span><span class="p">]</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<span class="k">def</span> <span class="nf">_get_index_map</span><span class="p">(</span>
<span class="n">sdf</span><span class="p">:</span> <span class="n">PySparkDataFrame</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">PySparkColumn</span><span class="p">]],</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]]]:</span>
<span class="n">index_spark_columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">PySparkColumn</span><span class="p">]]</span>
<span class="n">index_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]]</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">index_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span>
<span class="n">sdf_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">:</span>
<span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">sdf_columns</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_spark_columns</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span>
<span class="n">_get_dummies_default_accept_types</span> <span class="o">=</span> <span class="p">(</span><span class="n">DecimalType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">DateType</span><span class="p">)</span>
<span class="n">_get_dummies_acceptable_types</span> <span class="o">=</span> <span class="n">_get_dummies_default_accept_types</span> <span class="o">+</span> <span class="p">(</span>
<span class="n">ByteType</span><span class="p">,</span>
<span class="n">ShortType</span><span class="p">,</span>
<span class="n">IntegerType</span><span class="p">,</span>
<span class="n">LongType</span><span class="p">,</span>
<span class="n">FloatType</span><span class="p">,</span>
<span class="n">DoubleType</span><span class="p">,</span>
<span class="n">BooleanType</span><span class="p">,</span>
<span class="n">TimestampType</span><span class="p">,</span>
<span class="n">TimestampNTZType</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">shutil</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="kn">import</span> <span class="nn">uuid</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.pandas.namespace</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">namespace</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;sf&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">F</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.namespace tests&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="n">db_name</span> <span class="o">=</span> <span class="s2">&quot;db</span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="nb">str</span><span class="p">(</span><span class="n">uuid</span><span class="o">.</span><span class="n">uuid4</span><span class="p">())</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;-&quot;</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;CREATE DATABASE </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;db&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">db_name</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;path&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">path</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">namespace</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">ignore_errors</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;DROP DATABASE IF EXISTS </span><span class="si">%s</span><span class="s2"> CASCADE&quot;</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright © 2024 The Apache Software Foundation, licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>