| |
| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en"> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>pyspark.pandas.namespace — PySpark 4.0.0-preview1 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| // Copy the saved "mode" and "theme" preferences from localStorage onto the root |
| // element's data attributes, falling back to "" and "light" when nothing is stored. |
| (function (rootData) { |
| rootData.mode = localStorage.getItem("mode") || ""; |
| rootData.theme = localStorage.getItem("theme") || "light"; |
| })(document.documentElement.dataset); |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/clipboard.min.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/pandas/namespace';</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/namespace.html" /> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <!-- Content language advertised to search tooling; the page text is English. --> |
| <meta name="docsearch:language" content="en"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| // Reuse the page-wide Matomo command queue if one already exists, otherwise create it. |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| // Base URL used for both the tracker URL (matomo.php) and the tracker script (matomo.js). |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| // Create an async script element for matomo.js and insert it ahead of the first |
| // script element found in the document. |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../../../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../../../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="../../../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="../../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Build the homepage URL of another docs version from its versions.json entry. |
| function buildURL(entry) { |
| // The URL pattern is supplied by jinja; only the {version} placeholder is substituted. |
| return "https://spark.apache.org/docs/{version}/api/python/index.html".replace("{version}", entry.version); |
| } |
| |
| // Click handler for a version-switcher link: go to this same page in the other docs |
| // version when it exists there, otherwise go to that version's homepage. |
| // Always returns false so the browser does not follow the link by itself. |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/pandas/namespace.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| // The homepage href produced by buildURL ends in a file name (index.html), so the page |
| // path has to be resolved against it as a relative URL. Plain concatenation would give |
| // ".../index.html_modules/...", which never exists and always triggers the fallback. |
| const tryUrl = new URL(currentFilePath, new URL(otherDocsHomepage, location.href)).href; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| // the HEAD request failed: fall back to the other version's homepage |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Populate this version switcher's dropdown from the published versions.json. |
| (function () { |
| // The switcher markup is rendered more than once on this page with the same ids, and an |
| // id lookup only returns the first match. Use the menu that sits next to this script |
| // instead; currentScript is only set while the script executes, so it is read up front. |
| const ownScript = document.currentScript; |
| const menu = (ownScript && ownScript.parentNode.querySelector("#version_switcher")) || document.getElementById("version_switcher"); |
| if (!menu) { |
| // no dropdown to fill, so there is nothing to fetch |
| return; |
| } |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| // the handler tries the matching page in the other version before the homepage |
| node.onclick = checkPageExistsAndRedirect; |
| menu.appendChild(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar hide-on-wide"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Build the homepage URL of another docs version from its versions.json entry. |
| function buildURL(entry) { |
| // The URL pattern is supplied by jinja; only the {version} placeholder is substituted. |
| return "https://spark.apache.org/docs/{version}/api/python/index.html".replace("{version}", entry.version); |
| } |
| |
| // Click handler for a version-switcher link: go to this same page in the other docs |
| // version when it exists there, otherwise go to that version's homepage. |
| // Always returns false so the browser does not follow the link by itself. |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/pandas/namespace.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| // The homepage href produced by buildURL ends in a file name (index.html), so the page |
| // path has to be resolved against it as a relative URL. Plain concatenation would give |
| // ".../index.html_modules/...", which never exists and always triggers the fallback. |
| const tryUrl = new URL(currentFilePath, new URL(otherDocsHomepage, location.href)).href; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| // the HEAD request failed: fall back to the other version's homepage |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Populate this version switcher's dropdown from the published versions.json. |
| (function () { |
| // The switcher markup is rendered more than once on this page with the same ids, and an |
| // id lookup only returns the first match. Use the menu that sits next to this script |
| // instead; currentScript is only set while the script executes, so it is read up front. |
| const ownScript = document.currentScript; |
| const menu = (ownScript && ownScript.parentNode.querySelector("#version_switcher")) || document.getElementById("version_switcher"); |
| if (!menu) { |
| // no dropdown to fill, so there is nothing to fetch |
| return; |
| } |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| // the handler tries the matching page in the other version before the homepage |
| node.onclick = checkPageExistsAndRedirect; |
| menu.appendChild(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <!-- The enclosing nav is the navigation landmark; the list keeps its native list semantics. --> |
| <ul class="bd-breadcrumbs"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../../../index.html" class="nav-link" aria-label="Home"> |
| <!-- Decorative icon: the link is named by its aria-label. --> |
| <i class="fa-solid fa-home" aria-hidden="true"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">pyspark.pandas.namespace</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <h1>Source code for pyspark.pandas.namespace</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd">Wrappers around spark that correspond to common pandas functions.</span> |
| <span class="sd">"""</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Set</span><span class="p">,</span> |
| <span class="n">Sized</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Type</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="n">no_type_check</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Iterable</span> |
| <span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">tzinfo</span> |
| <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span> |
| <span class="kn">from</span> <span class="nn">io</span> <span class="kn">import</span> <span class="n">BytesIO</span> |
| <span class="kn">import</span> <span class="nn">json</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| |
| <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="n">is_datetime64_dtype</span><span class="p">,</span> |
| <span class="n">is_list_like</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pandas.tseries.offsets</span> <span class="kn">import</span> <span class="n">DateOffset</span> |
| <span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span> |
| <span class="kn">import</span> <span class="nn">pyarrow.parquet</span> <span class="k">as</span> <span class="nn">pq</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span><span class="p">,</span> <span class="n">Column</span> <span class="k">as</span> <span class="n">PySparkColumn</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">ByteType</span><span class="p">,</span> |
| <span class="n">ShortType</span><span class="p">,</span> |
| <span class="n">IntegerType</span><span class="p">,</span> |
| <span class="n">LongType</span><span class="p">,</span> |
| <span class="n">FloatType</span><span class="p">,</span> |
| <span class="n">DoubleType</span><span class="p">,</span> |
| <span class="n">BooleanType</span><span class="p">,</span> |
| <span class="n">TimestampType</span><span class="p">,</span> |
| <span class="n">TimestampNTZType</span><span class="p">,</span> |
| <span class="n">DecimalType</span><span class="p">,</span> |
| <span class="n">StringType</span><span class="p">,</span> |
| <span class="n">DateType</span><span class="p">,</span> |
| <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">DataType</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">PySparkDataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.base</span> <span class="kn">import</span> <span class="n">IndexOpsMixin</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">align_diff_frames</span><span class="p">,</span> |
| <span class="n">default_session</span><span class="p">,</span> |
| <span class="n">is_name_like_tuple</span><span class="p">,</span> |
| <span class="n">is_name_like_value</span><span class="p">,</span> |
| <span class="n">name_like_string</span><span class="p">,</span> |
| <span class="n">same_anchor</span><span class="p">,</span> |
| <span class="n">scol_for</span><span class="p">,</span> |
| <span class="n">validate_axis</span><span class="p">,</span> |
| <span class="n">log_advice</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">_reduce_spark_multi</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">,</span> |
| <span class="n">DEFAULT_SERIES_NAME</span><span class="p">,</span> |
| <span class="n">HIDDEN_COLUMNS</span><span class="p">,</span> |
| <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark.utils</span> <span class="kn">import</span> <span class="n">as_nullable_spark_type</span><span class="p">,</span> <span class="n">force_decimal_precision_scale</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">Index</span><span class="p">,</span> <span class="n">DatetimeIndex</span><span class="p">,</span> <span class="n">TimedeltaIndex</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.multi</span> <span class="kn">import</span> <span class="n">MultiIndex</span> |
| |
| <span class="c1"># For Supporting Spark Connect</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">get_column_class</span> |
| |
| <span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="s2">"from_pandas"</span><span class="p">,</span> |
| <span class="s2">"range"</span><span class="p">,</span> |
| <span class="s2">"read_csv"</span><span class="p">,</span> |
| <span class="s2">"read_delta"</span><span class="p">,</span> |
| <span class="s2">"read_table"</span><span class="p">,</span> |
| <span class="s2">"read_spark_io"</span><span class="p">,</span> |
| <span class="s2">"read_parquet"</span><span class="p">,</span> |
| <span class="s2">"read_clipboard"</span><span class="p">,</span> |
| <span class="s2">"read_excel"</span><span class="p">,</span> |
| <span class="s2">"read_html"</span><span class="p">,</span> |
| <span class="s2">"to_datetime"</span><span class="p">,</span> |
| <span class="s2">"date_range"</span><span class="p">,</span> |
| <span class="s2">"to_timedelta"</span><span class="p">,</span> |
| <span class="s2">"timedelta_range"</span><span class="p">,</span> |
| <span class="s2">"get_dummies"</span><span class="p">,</span> |
| <span class="s2">"concat"</span><span class="p">,</span> |
| <span class="s2">"melt"</span><span class="p">,</span> |
| <span class="s2">"isna"</span><span class="p">,</span> |
| <span class="s2">"isnull"</span><span class="p">,</span> |
| <span class="s2">"notna"</span><span class="p">,</span> |
| <span class="s2">"notnull"</span><span class="p">,</span> |
| <span class="s2">"read_sql_table"</span><span class="p">,</span> |
| <span class="s2">"read_sql_query"</span><span class="p">,</span> |
| <span class="s2">"read_sql"</span><span class="p">,</span> |
| <span class="s2">"read_json"</span><span class="p">,</span> |
| <span class="s2">"merge"</span><span class="p">,</span> |
| <span class="s2">"merge_asof"</span><span class="p">,</span> |
| <span class="s2">"to_numeric"</span><span class="p">,</span> |
| <span class="s2">"broadcast"</span><span class="p">,</span> |
| <span class="s2">"read_orc"</span><span class="p">,</span> |
| <span class="p">]</span> |
| |
| |
| <span class="k">def</span> <span class="nf">from_pandas</span><span class="p">(</span><span class="n">pobj</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">Index</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Create a pandas-on-Spark DataFrame, Series or Index from a pandas DataFrame, Series or Index.</span> |
| |
| <span class="sd"> This is similar to Spark's `SparkSession.createDataFrame()` with pandas DataFrame,</span> |
| <span class="sd"> but this also works with pandas Series and picks the index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> pobj : pandas.DataFrame, pandas.Series or pandas.Index</span> |
| <span class="sd"> pandas DataFrame, Series or Index to read.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series, DataFrame or Index</span> |
| <span class="sd"> If a pandas Series is passed in, this function returns a pandas-on-Spark Series.</span> |
| <span class="sd"> If a pandas DataFrame is passed in, this function returns a pandas-on-Spark DataFrame.</span> |
| <span class="sd"> If a pandas Index is passed in, this function returns a pandas-on-Spark Index.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">Series</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">pobj</span><span class="p">))</span><span class="o">.</span><span class="n">index</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unknown data type: </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| |
| |
| <span class="c1"># built-in range</span> |
| <span class="n">_range</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="nb">range</span><span class="p">]</span> <span class="o">=</span> <span class="nb">range</span> <span class="c1"># type: ignore[assignment]</span> |
| |
| |
| <div class="viewcode-block" id="range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.range.html#pyspark.pandas.range">[docs]</a><span class="k">def</span> <span class="nf">range</span><span class="p">(</span> |
| <span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">step</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">num_partitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create a DataFrame with some range of numbers.</span> |
| |
| <span class="sd"> The resulting DataFrame has a single int64 column named `id`, containing elements in a range</span> |
| <span class="sd"> from ``start`` to ``end`` (exclusive) with step value ``step``. If only the first parameter</span> |
| <span class="sd"> (i.e. start) is specified, we treat it as the end value with the start value being 0.</span> |
| |
| <span class="sd"> This is like the range function in SparkSession and is used primarily for testing.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : int</span> |
| <span class="sd"> the start value (inclusive)</span> |
| <span class="sd"> end : int, optional</span> |
| <span class="sd"> the end value (exclusive)</span> |
| <span class="sd"> step : int, optional, default 1</span> |
| <span class="sd"> the incremental step</span> |
| <span class="sd"> num_partitions : int, optional</span> |
| <span class="sd"> the number of partitions of the DataFrame</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> When only the first parameter is specified, we generate a range of values from 0 up to (but not including) that number.</span> |
| |
| <span class="sd"> >>> ps.range(5)</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 4</span> |
| |
| <span class="sd"> When start, end, and step are specified:</span> |
| |
| <span class="sd"> >>> ps.range(start = 100, end = 200, step = 20)</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 100</span> |
| <span class="sd"> 1 120</span> |
| <span class="sd"> 2 140</span> |
| <span class="sd"> 3 160</span> |
| <span class="sd"> 4 180</span> |
| <span class="sd"> """</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span> <span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span> <span class="n">step</span><span class="o">=</span><span class="n">step</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="n">num_partitions</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="read_csv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_csv.html#pyspark.pandas.read_csv">[docs]</a><span class="k">def</span> <span class="nf">read_csv</span><span class="p">(</span> |
| <span class="n">path</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> |
| <span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">","</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"infer"</span><span class="p">,</span> |
| <span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">usecols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">nrows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">parse_dates</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">quotechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">escapechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">comment</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Read CSV (comma-separated) file into DataFrame or Series.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : str or list</span> |
| <span class="sd"> Path(s) of the CSV file(s) to be read.</span> |
| <span class="sd"> sep : str, default ‘,’</span> |
| <span class="sd"> Delimiter to use. Must be a non-empty string.</span> |
| <span class="sd"> header : int or None, default ‘infer’</span> |
| <span class="sd"> Whether to use the column names, and the start of the data.</span> |
| <span class="sd"> Default behavior is to infer the column names: if no names are passed</span> |
| <span class="sd"> the behavior is identical to `header=0` and column names are inferred from</span> |
| <span class="sd"> the first line of the file, if column names are passed explicitly then</span> |
| <span class="sd"> the behavior is identical to `header=None`. Explicitly pass `header=0` to be</span> |
| <span class="sd"> able to replace existing names.</span> |
| <span class="sd"> names : str or array-like, optional</span> |
| <span class="sd"> List of column names to use. If file contains no header row, then you should</span> |
| <span class="sd"> explicitly pass `header=None`. Duplicates in this list will cause an error to be issued.</span> |
| <span class="sd"> If a string is given, it should be a DDL-formatted string in Spark SQL, which is</span> |
| <span class="sd"> preferred to avoid schema inference for better performance.</span> |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| <span class="sd"> usecols : list-like or callable, optional</span> |
| <span class="sd"> Return a subset of the columns. If list-like, all elements must either be</span> |
| <span class="sd"> positional (i.e. integer indices into the document columns) or strings that</span> |
| <span class="sd"> correspond to column names provided either by the user in names or inferred</span> |
| <span class="sd"> from the document header row(s).</span> |
| <span class="sd"> If callable, the callable function will be evaluated against the column names,</span> |
| <span class="sd"> returning names where the callable function evaluates to `True`.</span> |
| <span class="sd"> dtype : Type name or dict of column -> type, default None</span> |
| <span class="sd"> Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32}. Use str or object</span> |
| <span class="sd"> together with suitable na_values settings to preserve and not interpret dtype.</span> |
| <span class="sd"> nrows : int, default None</span> |
| <span class="sd"> Number of rows to read from the CSV file.</span> |
| <span class="sd"> parse_dates : boolean or list of ints or names or list of lists or dict, default `False`.</span> |
| <span class="sd"> Currently only `False` is allowed.</span> |
| <span class="sd"> quotechar : str (length 1), optional</span> |
| <span class="sd"> The character used to denote the start and end of a quoted item. Quoted items can include</span> |
| <span class="sd"> the delimiter and it will be ignored.</span> |
| <span class="sd"> escapechar : str (length 1), default None</span> |
| <span class="sd"> One-character string used to escape other characters.</span> |
| <span class="sd"> comment : str (length 1), optional</span> |
| <span class="sd"> Character that indicates the line should not be parsed.</span> |
| <span class="sd"> encoding : str, optional</span> |
| <span class="sd"> Encoding to use when reading the file.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's data source.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.read_csv('data.csv') # doctest: +SKIP</span> |
| |
| <span class="sd"> Load multiple CSV files as a single DataFrame:</span> |
| |
| <span class="sd"> >>> ps.read_csv(['data-01.csv', 'data-02.csv']) # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="c1"># latin-1 is the same encoding as iso-8859-1, so it is mapped to iso-8859-1.</span> |
| <span class="n">encoding_mapping</span> <span class="o">=</span> <span class="p">{</span><span class="s2">"latin-1"</span><span class="p">:</span> <span class="s2">"iso-8859-1"</span><span class="p">}</span> |
| |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">parse_dates</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">False</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"parse_dates can only be `False`: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">parse_dates</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">):</span> |
| <span class="n">usecols</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="c1"># type: ignore[assignment]</span> |
| |
| <span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"inferSchema"</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"sep"</span><span class="p">,</span> <span class="n">sep</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">header</span> <span class="o">==</span> <span class="s2">"infer"</span><span class="p">:</span> |
| <span class="n">header</span> <span class="o">=</span> <span class="mi">0</span> <span class="k">if</span> <span class="n">names</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">header</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"header"</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"header"</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Unknown header argument </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">header</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">quotechar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"quote"</span><span class="p">,</span> <span class="n">quotechar</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">escapechar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"escape"</span><span class="p">,</span> <span class="n">escapechar</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">comment</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">comment</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">comment</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Only length-1 comment characters supported"</span><span class="p">)</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"comment"</span><span class="p">,</span> <span class="n">comment</span><span class="p">)</span> |
| |
| <span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">encoding</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"encoding"</span><span class="p">,</span> <span class="n">encoding_mapping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">encoding</span><span class="p">,</span> <span class="n">encoding</span><span class="p">))</span> |
| |
| <span class="n">column_labels</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">names</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">names</span><span class="p">)</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">}</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">names</span><span class="p">):</span> |
| <span class="n">names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">names</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">names</span><span class="p">))</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Found non-unique column index"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"The number of names [</span><span class="si">%s</span><span class="s2">] does not match the number "</span> |
| <span class="s2">"of columns [</span><span class="si">%d</span><span class="s2">]. Try names by a Spark SQL DDL-formatted "</span> |
| <span class="s2">"string."</span> <span class="o">%</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">schema</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">names</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="nb">enumerate</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">}</span> |
| |
| <span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">missing</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> |
| <span class="k">if</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">):</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">usecols</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="p">}</span> |
| <span class="n">missing</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span><span class="p">):</span> |
| <span class="n">usecols_ints</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">usecols</span><span class="p">)</span> |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">label</span><span class="p">:</span> <span class="n">col</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">())</span> |
| <span class="k">if</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">usecols_ints</span> |
| <span class="p">}</span> |
| <span class="n">missing</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">col</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols_ints</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="n">col</span> <span class="o">>=</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="ow">or</span> <span class="nb">list</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)[</span><span class="n">col</span><span class="p">]</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">new_column_labels</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">new_column_labels</span> |
| <span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span><span class="p">):</span> |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">usecols</span> |
| <span class="p">}</span> |
| <span class="n">missing</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span> <span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">new_column_labels</span><span class="p">]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">new_column_labels</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"'usecols' must either be list-like of all strings, "</span> |
| <span class="s2">"all unicode, all integers or a callable."</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">missing</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Usecols do not match columns, columns expected but not "</span> <span class="s2">"found: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">missing</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">values</span><span class="p">()])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">{}</span> |
| |
| <span class="k">if</span> <span class="n">nrows</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="n">nrows</span><span class="p">)</span> |
| |
| <span class="n">index_spark_column_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">)):</span> |
| <span class="n">index_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">column_labels</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">index_col</span> |
| <span class="p">}</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If `index_col` is not specified for `read_csv`, "</span> |
| <span class="s2">"the default index is attached which can cause additional overhead."</span> |
| <span class="p">)</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span> |
| |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">label</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">label</span><span class="p">,)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">values</span><span class="p">()],</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dtype</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">col</span><span class="p">,</span> <span class="n">tpe</span> <span class="ow">in</span> <span class="n">dtype</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">tpe</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| |
| <!-- Sphinx viewcode block for pyspark.pandas.read_json: the [docs] anchor links back to the read_json API reference page and is followed by the syntax-highlighted Python source. The listed function logs advice when index_col is None, unwraps a lone options dict keyword, raises NotImplementedError when lines is falsy, and delegates to read_spark_io with format json. NOTE(review): this markup looks machine-generated and whitespace-sensitive (presumably inside a pre element); confirm before reformatting by hand. --><div class="viewcode-block" id="read_json"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_json.html#pyspark.pandas.read_json">[docs]</a><span class="k">def</span> <span class="nf">read_json</span><span class="p">(</span> |
| <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">lines</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert a JSON string to DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : string</span> |
| <span class="sd"> File path</span> |
| <span class="sd"> lines : bool, default True</span> |
| <span class="sd"> Read the file as a JSON object per line. It should be always True for now.</span> |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's data source.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([['a', 'b'], ['c', 'd']],</span> |
| <span class="sd"> ... columns=['col 1', 'col 2'])</span> |
| |
| <span class="sd"> >>> df.to_json(path=r'%s/read_json/foo.json' % path, num_files=1)</span> |
| <span class="sd"> >>> ps.read_json(</span> |
| <span class="sd"> ... path=r'%s/read_json/foo.json' % path</span> |
| <span class="sd"> ... ).sort_values(by="col 1")</span> |
| <span class="sd"> col 1 col 2</span> |
| <span class="sd"> 0 a b</span> |
| <span class="sd"> 1 c d</span> |
| |
| <span class="sd"> >>> df.to_json(path=r'%s/read_json/foo.json' % path, num_files=1, lineSep='___')</span> |
| <span class="sd"> >>> ps.read_json(</span> |
| <span class="sd"> ... path=r'%s/read_json/foo.json' % path, lineSep='___'</span> |
| <span class="sd"> ... ).sort_values(by="col 1")</span> |
| <span class="sd"> col 1 col 2</span> |
| <span class="sd"> 0 a b</span> |
| <span class="sd"> 1 c d</span> |
| |
| <span class="sd"> You can preserve the index in the roundtrip as below.</span> |
| |
| <span class="sd"> >>> df.to_json(path=r'%s/read_json/bar.json' % path, num_files=1, index_col="index")</span> |
| <span class="sd"> >>> ps.read_json(</span> |
| <span class="sd"> ... path=r'%s/read_json/bar.json' % path, index_col="index"</span> |
| <span class="sd"> ... ).sort_values(by="col 1") # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> col 1 col 2</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 a b</span> |
| <span class="sd"> 1 c d</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If `index_col` is not specified for `read_json`, "</span> |
| <span class="s2">"the default index is attached which can cause additional overhead."</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">lines</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"lines=False is not implemented yet."</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">"json"</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div> |
| |
| |
| <!-- Sphinx viewcode block for pyspark.pandas.read_delta: the [docs] anchor links back to the read_delta API reference page and is followed by the syntax-highlighted Python source. The listed function logs advice when index_col is None, raises ValueError when both version and timestamp are given, unwraps a lone options dict keyword, stores version and timestamp under the versionAsOf and timestampAsOf option keys, and delegates to read_spark_io with format delta. NOTE(review): after the unwrap, those two assignments write into the dict object the caller passed as options; confirm against the upstream source whether that is intended. This markup looks machine-generated and whitespace-sensitive (presumably inside a pre element); confirm before reformatting by hand. --><div class="viewcode-block" id="read_delta"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_delta.html#pyspark.pandas.read_delta">[docs]</a><span class="k">def</span> <span class="nf">read_delta</span><span class="p">(</span> |
| <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">version</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">timestamp</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Read a Delta Lake table on some file system and return a DataFrame.</span> |
| |
| <span class="sd"> If the Delta Lake table is already stored in the catalog (aka the metastore), use 'read_table'.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : string</span> |
| <span class="sd"> Path to the Delta Lake table.</span> |
| <span class="sd"> version : string, optional</span> |
| <span class="sd"> Specifies the table version (based on Delta's internal transaction version) to read from,</span> |
| <span class="sd"> using Delta's time travel feature. This sets Delta's 'versionAsOf' option. Note that</span> |
| <span class="sd"> this parameter and `timestamp` parameter cannot be used together, otherwise it will raise a</span> |
| <span class="sd"> `ValueError`.</span> |
| <span class="sd"> timestamp : string, optional</span> |
| <span class="sd"> Specifies the table version (based on timestamp) to read from,</span> |
| <span class="sd"> using Delta's time travel feature. This must be a valid date or timestamp string in Spark,</span> |
| <span class="sd"> and sets Delta's 'timestampAsOf' option. Note that this parameter and `version` parameter</span> |
| <span class="sd"> cannot be used together, otherwise it will raise a `ValueError`.</span> |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| <span class="sd"> options</span> |
| <span class="sd"> Additional options that can be passed onto Delta.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.to_delta</span> |
| <span class="sd"> read_table</span> |
| <span class="sd"> read_spark_io</span> |
| <span class="sd"> read_parquet</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.range(1).to_delta('%s/read_delta/foo' % path) # doctest: +SKIP</span> |
| <span class="sd"> >>> ps.read_delta('%s/read_delta/foo' % path) # doctest: +SKIP</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 0</span> |
| |
| <span class="sd"> >>> ps.range(10, 15, num_partitions=1).to_delta('%s/read_delta/foo' % path,</span> |
| <span class="sd"> ... mode='overwrite') # doctest: +SKIP</span> |
| <span class="sd"> >>> ps.read_delta('%s/read_delta/foo' % path) # doctest: +SKIP</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 10</span> |
| <span class="sd"> 1 11</span> |
| <span class="sd"> 2 12</span> |
| <span class="sd"> 3 13</span> |
| <span class="sd"> 4 14</span> |
| |
| <span class="sd"> >>> ps.read_delta('%s/read_delta/foo' % path, version=0) # doctest: +SKIP</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 0</span> |
| |
| <span class="sd"> You can preserve the index in the roundtrip as below.</span> |
| |
| <span class="sd"> >>> ps.range(10, 15, num_partitions=1).to_delta(</span> |
| <span class="sd"> ... '%s/read_delta/bar' % path, index_col="index") # doctest: +SKIP</span> |
| <span class="sd"> >>> ps.read_delta('%s/read_delta/bar' % path, index_col="index") # doctest: +SKIP</span> |
| <span class="sd"> id</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 10</span> |
| <span class="sd"> 1 11</span> |
| <span class="sd"> 2 12</span> |
| <span class="sd"> 3 13</span> |
| <span class="sd"> 4 14</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If `index_col` is not specified for `read_delta`, "</span> |
| <span class="s2">"the default index is attached which can cause additional overhead."</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">version</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"version and timestamp cannot be used together."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">version</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">options</span><span class="p">[</span><span class="s2">"versionAsOf"</span><span class="p">]</span> <span class="o">=</span> <span class="n">version</span> |
| <span class="k">if</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">options</span><span class="p">[</span><span class="s2">"timestampAsOf"</span><span class="p">]</span> <span class="o">=</span> <span class="n">timestamp</span> |
| <span class="k">return</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">"delta"</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div> |
| |
| |
| <!-- Sphinx viewcode block for pyspark.pandas.read_table: the [docs] anchor links back to the read_table API reference page and is followed by the syntax-highlighted Python source. The listed function logs advice when index_col is None, loads the table with default_session().read.table(name), obtains index columns and names from _get_index_map, and returns a DataFrame built on an InternalFrame over that Spark frame. NOTE(review): this markup looks machine-generated and whitespace-sensitive (presumably inside a pre element); confirm before reformatting by hand. --><div class="viewcode-block" id="read_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_table.html#pyspark.pandas.read_table">[docs]</a><span class="k">def</span> <span class="nf">read_table</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Read a Spark table and return a DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : string</span> |
| <span class="sd"> Table name in Spark.</span> |
| |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.to_table</span> |
| <span class="sd"> read_delta</span> |
| <span class="sd"> read_parquet</span> |
| <span class="sd"> read_spark_io</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.range(1).to_table('%s.my_table' % db)</span> |
| <span class="sd"> >>> ps.read_table('%s.my_table' % db)</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 0</span> |
| |
| <span class="sd"> >>> ps.range(1).to_table('%s.my_table' % db, index_col="index")</span> |
| <span class="sd"> >>> ps.read_table('%s.my_table' % db, index_col="index") # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> id</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If `index_col` is not specified for `read_table`, "</span> |
| <span class="s2">"the default index is attached which can cause additional overhead."</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="read_spark_io"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_spark_io.html#pyspark.pandas.read_spark_io">[docs]</a><span class="k">def</span> <span class="nf">read_spark_io</span><span class="p">(</span> |
| <span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">"StructType"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Load a DataFrame from a Spark data source.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : string, optional</span> |
| <span class="sd"> Path to the data source.</span> |
| <span class="sd"> format : string, optional</span> |
| <span class="sd"> Specifies the output data source format. Some common ones are:</span> |
| |
| <span class="sd"> - 'delta'</span> |
| <span class="sd"> - 'parquet'</span> |
| <span class="sd"> - 'orc'</span> |
| <span class="sd"> - 'json'</span> |
| <span class="sd"> - 'csv'</span> |
| <span class="sd"> schema : string or StructType, optional</span> |
| <span class="sd"> Input schema. If none, Spark tries to infer the schema automatically.</span> |
| <span class="sd"> The schema can either be a Spark StructType, or a DDL-formatted string like</span> |
| <span class="sd"> `col0 INT, col1 DOUBLE`.</span> |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's data source.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.read_table</span> |
| <span class="sd"> DataFrame.read_delta</span> |
| <span class="sd"> DataFrame.read_parquet</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.range(1).spark.to_spark_io('%s/read_spark_io/data.parquet' % path)</span> |
| <span class="sd"> >>> ps.read_spark_io(</span> |
| <span class="sd"> ... '%s/read_spark_io/data.parquet' % path, format='parquet', schema='id long')</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 0</span> |
| |
| <span class="sd"> >>> ps.range(10, 15, num_partitions=1).spark.to_spark_io('%s/read_spark_io/data.json' % path,</span> |
| <span class="sd"> ... format='json', lineSep='__')</span> |
| <span class="sd"> >>> ps.read_spark_io(</span> |
| <span class="sd"> ... '%s/read_spark_io/data.json' % path, format='json', schema='id long', lineSep='__')</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 10</span> |
| <span class="sd"> 1 11</span> |
| <span class="sd"> 2 12</span> |
| <span class="sd"> 3 13</span> |
| <span class="sd"> 4 14</span> |
| |
| <span class="sd"> You can preserve the index in the roundtrip as below.</span> |
| |
| <span class="sd"> >>> ps.range(10, 15, num_partitions=1).spark.to_spark_io('%s/read_spark_io/data.orc' % path,</span> |
| <span class="sd"> ... format='orc', index_col="index")</span> |
| <span class="sd"> >>> ps.read_spark_io(</span> |
| <span class="sd"> ... path=r'%s/read_spark_io/data.orc' % path, format="orc", index_col="index")</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> id</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 10</span> |
| <span class="sd"> 1 11</span> |
| <span class="sd"> 2 12</span> |
| <span class="sd"> 3 13</span> |
| <span class="sd"> 4 14</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span> |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="read_parquet"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_parquet.html#pyspark.pandas.read_parquet">[docs]</a><span class="k">def</span> <span class="nf">read_parquet</span><span class="p">(</span> |
| <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">pandas_metadata</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Load a parquet object from the file path, returning a DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : string</span> |
| <span class="sd"> File path</span> |
| <span class="sd"> columns : list, default=None</span> |
| <span class="sd"> If not None, only these columns will be read from the file.</span> |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| <span class="sd"> pandas_metadata : bool, default: False</span> |
| <span class="sd"> If True, try to respect the metadata if the Parquet file is written from pandas.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's data source.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.to_parquet</span> |
| <span class="sd"> read_table</span> |
| <span class="sd"> read_delta</span> |
| <span class="sd"> read_spark_io</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.range(1).to_parquet('%s/read_spark_io/data.parquet' % path)</span> |
| <span class="sd"> >>> ps.read_parquet('%s/read_spark_io/data.parquet' % path, columns=['id'])</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 0</span> |
| |
| <span class="sd"> You can preserve the index in the roundtrip as below.</span> |
| |
| <span class="sd"> >>> ps.range(1).to_parquet('%s/read_spark_io/data.parquet' % path, index_col="index")</span> |
| <span class="sd"> >>> ps.read_parquet('%s/read_spark_io/data.parquet' % path, columns=['id'], index_col="index")</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> id</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If `index_col` is not specified for `read_parquet`, "</span> |
| <span class="s2">"the default index is attached which can cause additional overhead."</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> |
| |
| <span class="n">index_names</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">pandas_metadata</span><span class="p">:</span> |
| <span class="c1"># Try to read pandas metadata</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span> <span class="c1"># type: ignore[call-overload]</span> |
| <span class="s2">"index_col array<string>, index_names array<string>"</span> |
| <span class="p">)</span> |
| <span class="k">def</span> <span class="nf">read_index_metadata</span><span class="p">(</span><span class="n">pser</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">binary</span> <span class="o">=</span> <span class="n">pser</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">metadata</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">BufferReader</span><span class="p">(</span><span class="n">binary</span><span class="p">))</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">metadata</span> |
| <span class="k">if</span> <span class="sa">b</span><span class="s2">"pandas"</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">:</span> |
| <span class="n">pandas_metadata</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">metadata</span><span class="p">[</span><span class="sa">b</span><span class="s2">"pandas"</span><span class="p">]</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">"utf8"</span><span class="p">))</span> |
| <span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">"index_columns"</span><span class="p">]):</span> |
| <span class="n">index_col</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">"index_columns"</span><span class="p">]:</span> |
| <span class="n">index_col</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">"columns"</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="n">column</span><span class="p">[</span><span class="s2">"field_name"</span><span class="p">]</span> <span class="o">==</span> <span class="n">col</span><span class="p">:</span> |
| <span class="n">index_names</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column</span><span class="p">[</span><span class="s2">"name"</span><span class="p">])</span> |
| <span class="k">break</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_names</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">"index_col"</span><span class="p">:</span> <span class="p">[</span><span class="n">index_col</span><span class="p">],</span> <span class="s2">"index_names"</span><span class="p">:</span> <span class="p">[</span><span class="n">index_names</span><span class="p">]})</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">"index_col"</span><span class="p">:</span> <span class="p">[</span><span class="kc">None</span><span class="p">],</span> <span class="s2">"index_names"</span><span class="p">:</span> <span class="p">[</span><span class="kc">None</span><span class="p">]})</span> |
| |
| <span class="n">index_col</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">default_session</span><span class="p">()</span> |
| <span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"binaryFile"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">read_index_metadata</span><span class="p">(</span><span class="s2">"content"</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"index_metadata"</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"index_metadata.*"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="n">options</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">new_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">columns</span> <span class="k">if</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_columns</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">new_columns</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span> |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">index_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="n">index_names</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| |
| <div class="viewcode-block" id="read_clipboard"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_clipboard.html#pyspark.pandas.read_clipboard">[docs]</a><span class="k">def</span> <span class="nf">read_clipboard</span><span class="p">(</span><span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">"\s+"</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""</span> |
| <span class="sd"> Read text from clipboard and pass to read_csv. See read_csv for the</span> |
| <span class="sd"> full argument list.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sep : str, default '\s+'</span> |
| <span class="sd"> A string or regex delimiter. The default of '\s+' denotes</span> |
| <span class="sd"> one or more whitespace characters.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.to_clipboard : Write text out to clipboard.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> parsed : DataFrame</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">read_clipboard</span><span class="p">(</span><span class="n">sep</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)))</span></div> |
| |
| |
| <div class="viewcode-block" id="read_excel"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_excel.html#pyspark.pandas.read_excel">[docs]</a><span class="k">def</span> <span class="nf">read_excel</span><span class="p">(</span> |
| <span class="n">io</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> |
| <span class="n">sheet_name</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">usecols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">engine</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">converters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">true_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">false_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">skiprows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">nrows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">na_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">keep_default_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">verbose</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">parse_dates</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">date_parser</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">thousands</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">comment</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">skipfooter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwds</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Read an Excel file into a pandas-on-Spark DataFrame or Series.</span> |
| |
| <span class="sd"> Support both `xls` and `xlsx` file extensions from a local filesystem or URL.</span> |
| <span class="sd"> Support an option to read a single sheet or a list of sheets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> io : str, file descriptor, pathlib.Path, ExcelFile or xlrd.Book</span> |
| <span class="sd"> The string could be a URL; the URL must be one that Spark's DataFrameReader can read.</span> |
| |
| <span class="sd"> .. note::</span> |
| <span class="sd"> If the underlying Spark is below 3.0, the parameter as a string is not supported.</span> |
| <span class="sd"> You can use `ps.from_pandas(pd.read_excel(...))` as a workaround.</span> |
| |
| <span class="sd"> sheet_name : str, int, list, or None, default 0</span> |
| <span class="sd"> Strings are used for sheet names. Integers are used in zero-indexed</span> |
| <span class="sd"> sheet positions. Lists of strings/integers are used to request</span> |
| <span class="sd"> multiple sheets. Specify None to get all sheets.</span> |
| |
| <span class="sd"> Available cases:</span> |
| |
| <span class="sd"> * Defaults to ``0``: 1st sheet as a `DataFrame`</span> |
| <span class="sd"> * ``1``: 2nd sheet as a `DataFrame`</span> |
| <span class="sd"> * ``"Sheet1"``: Load sheet with name "Sheet1"</span> |
| <span class="sd"> * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"</span> |
| <span class="sd"> as a dict of `DataFrame`</span> |
| <span class="sd"> * None: All sheets.</span> |
| |
| <span class="sd"> header : int, list of int, default 0</span> |
| <span class="sd"> Row (0-indexed) to use for the column labels of the parsed</span> |
| <span class="sd"> DataFrame. If a list of integers is passed those row positions will</span> |
| <span class="sd"> be combined into a ``MultiIndex``. Use None if there is no header.</span> |
| <span class="sd"> names : array-like, default None</span> |
| <span class="sd"> List of column names to use. If file contains no header row,</span> |
| <span class="sd"> then you should explicitly pass header=None.</span> |
| <span class="sd"> index_col : int, list of int, default None</span> |
| <span class="sd"> Column (0-indexed) to use as the row labels of the DataFrame.</span> |
| <span class="sd"> Pass None if there is no such column. If a list is passed,</span> |
| <span class="sd"> those columns will be combined into a ``MultiIndex``. If a</span> |
| <span class="sd"> subset of data is selected with ``usecols``, index_col</span> |
| <span class="sd"> is based on the subset.</span> |
| <span class="sd"> usecols : int, str, list-like, or callable, default None</span> |
| <span class="sd"> Return a subset of the columns.</span> |
| |
| <span class="sd"> * If None, then parse all columns.</span> |
| <span class="sd"> * If str, then indicates comma separated list of Excel column letters</span> |
| <span class="sd"> and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of</span> |
| <span class="sd"> both sides.</span> |
| <span class="sd"> * If list of int, then indicates list of column numbers to be parsed.</span> |
| <span class="sd"> * If list of string, then indicates list of column names to be parsed.</span> |
| <span class="sd"> * If callable, then evaluate each column name against it and parse the</span> |
| <span class="sd"> column if the callable returns ``True``.</span> |
| <span class="sd"> dtype : Type name or dict of column -> type, default None</span> |
| <span class="sd"> Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}</span> |
| <span class="sd"> Use `object` to preserve data as stored in Excel and not interpret dtype.</span> |
| <span class="sd"> If converters are specified, they will be applied INSTEAD</span> |
| <span class="sd"> of dtype conversion.</span> |
| <span class="sd"> engine : str, default None</span> |
| <span class="sd"> If io is not a buffer or path, this must be set to identify io.</span> |
| <span class="sd"> Acceptable values are None or xlrd.</span> |
| <span class="sd"> converters : dict, default None</span> |
| <span class="sd"> Dict of functions for converting values in certain columns. Keys can</span> |
| <span class="sd"> either be integers or column labels, values are functions that take one</span> |
| <span class="sd"> input argument, the Excel cell content, and return the transformed</span> |
| <span class="sd"> content.</span> |
| <span class="sd"> true_values : list, default None</span> |
| <span class="sd"> Values to consider as True.</span> |
| <span class="sd"> false_values : list, default None</span> |
| <span class="sd"> Values to consider as False.</span> |
| <span class="sd"> skiprows : list-like</span> |
| <span class="sd"> Rows to skip at the beginning (0-indexed).</span> |
| <span class="sd"> nrows : int, default None</span> |
| <span class="sd"> Number of rows to parse.</span> |
| <span class="sd"> na_values : scalar, str, list-like, or dict, default None</span> |
| <span class="sd"> Additional strings to recognize as NA/NaN. If dict passed, specific</span> |
| <span class="sd"> per-column NA values. By default, the pandas default NA strings (for example</span> |
| <span class="sd"> '', 'NA', 'NaN' and 'null') are interpreted as NaN.</span> |
| <span class="sd"> keep_default_na : bool, default True</span> |
| <span class="sd"> If na_values are specified and keep_default_na is False the default NaN</span> |
| <span class="sd"> values are overridden, otherwise they're appended to.</span> |
| <span class="sd"> verbose : bool, default False</span> |
| <span class="sd"> Indicate number of NA values placed in non-numeric columns.</span> |
| <span class="sd"> parse_dates : bool, list-like, or dict, default False</span> |
| <span class="sd"> The behavior is as follows:</span> |
| |
| <span class="sd"> * bool. If True -> try parsing the index.</span> |
| <span class="sd"> * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3</span> |
| <span class="sd"> each as a separate date column.</span> |
| <span class="sd"> * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as</span> |
| <span class="sd"> a single date column.</span> |
| <span class="sd"> * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call</span> |
| <span class="sd"> result 'foo'</span> |
| |
| <span class="sd"> If a column or index contains an unparseable date, the entire column or</span> |
| <span class="sd"> index will be returned unaltered as an object data type. For non-standard</span> |
| <span class="sd"> datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.</span> |
| |
| <span class="sd"> Note: A fast-path exists for iso8601-formatted dates.</span> |
| <span class="sd"> date_parser : function, optional</span> |
| <span class="sd"> Function to use for converting a sequence of string columns to an array of</span> |
| <span class="sd"> datetime instances. The default uses ``dateutil.parser.parser`` to do the</span> |
| <span class="sd"> conversion. pandas-on-Spark will try to call `date_parser` in three different ways,</span> |
| <span class="sd"> advancing to the next if an exception occurs: 1) Pass one or more arrays</span> |
| <span class="sd"> (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the</span> |
| <span class="sd"> string values from the columns defined by `parse_dates` into a single array</span> |
| <span class="sd"> and pass that; and 3) call `date_parser` once for each row using one or</span> |
| <span class="sd"> more strings (corresponding to the columns defined by `parse_dates`) as</span> |
| <span class="sd"> arguments.</span> |
| <span class="sd"> thousands : str, default None</span> |
| <span class="sd"> Thousands separator for parsing string columns to numeric. Note that</span> |
| <span class="sd"> this parameter is only necessary for columns stored as TEXT in Excel,</span> |
| <span class="sd"> any numeric columns will automatically be parsed, regardless of display</span> |
| <span class="sd"> format.</span> |
| <span class="sd"> comment : str, default None</span> |
| <span class="sd"> Comments out remainder of line. Pass a character or characters to this</span> |
| <span class="sd"> argument to indicate comments in the input file. Any data between the</span> |
| <span class="sd"> comment string and the end of the current line is ignored.</span> |
| <span class="sd"> skipfooter : int, default 0</span> |
| <span class="sd"> Rows at the end to skip (0-indexed).</span> |
| <span class="sd"> **kwds : optional</span> |
| <span class="sd"> Optional keyword arguments can be passed to ``TextFileReader``.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or dict of DataFrames</span> |
| <span class="sd"> DataFrame from the passed in Excel file. See notes in sheet_name</span> |
| <span class="sd"> argument for more information on when a dict of DataFrames is returned.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.to_excel : Write DataFrame to an Excel file.</span> |
| <span class="sd"> DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.</span> |
| <span class="sd"> read_csv : Read a comma-separated values (csv) file into DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> The file can be read using the file name as string or an open file object:</span> |
| |
| <span class="sd"> >>> ps.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP</span> |
| <span class="sd"> Name Value</span> |
| <span class="sd"> 0 string1 1</span> |
| <span class="sd"> 1 string2 2</span> |
| <span class="sd"> 2 #Comment 3</span> |
| |
| <span class="sd"> >>> ps.read_excel(open('tmp.xlsx', 'rb'),</span> |
| <span class="sd"> ... sheet_name='Sheet3') # doctest: +SKIP</span> |
| <span class="sd"> Unnamed: 0 Name Value</span> |
| <span class="sd"> 0 0 string1 1</span> |
| <span class="sd"> 1 1 string2 2</span> |
| <span class="sd"> 2 2 #Comment 3</span> |
| |
| <span class="sd"> Index and header can be specified via the `index_col` and `header` arguments</span> |
| |
| <span class="sd"> >>> ps.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 0 NaN Name Value</span> |
| <span class="sd"> 1 0.0 string1 1</span> |
| <span class="sd"> 2 1.0 string2 2</span> |
| <span class="sd"> 3 2.0 #Comment 3</span> |
| |
| <span class="sd"> Column types are inferred but can be explicitly specified</span> |
| |
| <span class="sd"> >>> ps.read_excel('tmp.xlsx', index_col=0,</span> |
| <span class="sd"> ... dtype={'Name': str, 'Value': float}) # doctest: +SKIP</span> |
| <span class="sd"> Name Value</span> |
| <span class="sd"> 0 string1 1.0</span> |
| <span class="sd"> 1 string2 2.0</span> |
| <span class="sd"> 2 #Comment 3.0</span> |
| |
| <span class="sd"> True, False, and NA values, and thousands separators have defaults,</span> |
| <span class="sd"> but can be explicitly specified, too. Supply the values you would like</span> |
| <span class="sd"> as strings or lists of strings!</span> |
| |
| <span class="sd"> >>> ps.read_excel('tmp.xlsx', index_col=0,</span> |
| <span class="sd"> ... na_values=['string1', 'string2']) # doctest: +SKIP</span> |
| <span class="sd"> Name Value</span> |
| <span class="sd"> 0 None 1</span> |
| <span class="sd"> 1 None 2</span> |
| <span class="sd"> 2 #Comment 3</span> |
| |
| <span class="sd"> Comment lines in the excel input file can be skipped using the `comment` kwarg</span> |
| |
| <span class="sd"> >>> ps.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP</span> |
| <span class="sd"> Name Value</span> |
| <span class="sd"> 0 string1 1.0</span> |
| <span class="sd"> 1 string2 2.0</span> |
| <span class="sd"> 2 None NaN</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">pd_read_excel</span><span class="p">(</span> |
| <span class="n">io_or_bin</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">sn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span> |
| <span class="n">io</span><span class="o">=</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">,</span> <span class="p">(</span><span class="nb">bytes</span><span class="p">,</span> <span class="nb">bytearray</span><span class="p">))</span> <span class="k">else</span> <span class="n">io_or_bin</span><span class="p">,</span> |
| <span class="n">sheet_name</span><span class="o">=</span><span class="n">sn</span><span class="p">,</span> |
| <span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span> |
| <span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> |
| <span class="n">usecols</span><span class="o">=</span><span class="n">usecols</span><span class="p">,</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> |
| <span class="n">engine</span><span class="o">=</span><span class="n">engine</span><span class="p">,</span> |
| <span class="n">converters</span><span class="o">=</span><span class="n">converters</span><span class="p">,</span> |
| <span class="n">true_values</span><span class="o">=</span><span class="n">true_values</span><span class="p">,</span> |
| <span class="n">false_values</span><span class="o">=</span><span class="n">false_values</span><span class="p">,</span> |
| <span class="n">skiprows</span><span class="o">=</span><span class="n">skiprows</span><span class="p">,</span> |
| <span class="n">nrows</span><span class="o">=</span><span class="n">nrows</span><span class="p">,</span> |
| <span class="n">na_values</span><span class="o">=</span><span class="n">na_values</span><span class="p">,</span> |
| <span class="n">keep_default_na</span><span class="o">=</span><span class="n">keep_default_na</span><span class="p">,</span> |
| <span class="n">verbose</span><span class="o">=</span><span class="n">verbose</span><span class="p">,</span> |
| <span class="n">parse_dates</span><span class="o">=</span><span class="n">parse_dates</span><span class="p">,</span> <span class="c1"># type: ignore[arg-type]</span> |
| <span class="n">date_parser</span><span class="o">=</span><span class="n">date_parser</span><span class="p">,</span> |
| <span class="n">thousands</span><span class="o">=</span><span class="n">thousands</span><span class="p">,</span> |
| <span class="n">comment</span><span class="o">=</span><span class="n">comment</span><span class="p">,</span> |
| <span class="n">skipfooter</span><span class="o">=</span><span class="n">skipfooter</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwds</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">io</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="c1"># 'binaryFile' format is available since Spark 3.0.0.</span> |
| <span class="n">binaries</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"binaryFile"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">io</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"content"</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> |
| <span class="n">io_or_bin</span> <span class="o">=</span> <span class="n">binaries</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">single_file</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">binaries</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">io_or_bin</span> <span class="o">=</span> <span class="n">io</span> |
| <span class="n">single_file</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="n">pdf_or_psers</span> <span class="o">=</span> <span class="n">pd_read_excel</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">,</span> <span class="n">sn</span><span class="o">=</span><span class="n">sheet_name</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">single_file</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">{</span> |
| <span class="n">sn</span><span class="p">:</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">sn</span><span class="p">,</span> <span class="n">pdf_or_pser</span> <span class="ow">in</span> <span class="n">pdf_or_psers</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">}</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">read_excel_on_spark</span><span class="p">(</span> |
| <span class="n">pdf_or_pser</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> |
| <span class="n">sn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf_or_pser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf_or_pser</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf</span><span class="p">))</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">force_decimal_precision_scale</span><span class="p">(</span> |
| <span class="n">as_nullable_spark_type</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">output_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">pd_read_excel</span><span class="p">(</span><span class="nb">bin</span><span class="p">,</span> <span class="n">sn</span><span class="o">=</span><span class="n">sn</span><span class="p">)</span> <span class="k">for</span> <span class="nb">bin</span> <span class="ow">in</span> <span class="n">pdf</span><span class="p">[</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]]])</span> |
| |
| <span class="n">reset_index</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">name</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">reset_index</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">dt</span> <span class="o">=</span> <span class="n">col</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="k">if</span> <span class="n">is_datetime64_dtype</span><span class="p">(</span><span class="n">dt</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dt</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DatetimeTZDtype</span><span class="p">):</span> |
| <span class="k">continue</span> |
| <span class="n">reset_index</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="n">col</span><span class="o">.</span><span class="n">replace</span><span class="p">({</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">:</span> <span class="kc">None</span><span class="p">})</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">reset_index</span> |
| |
| <span class="c1"># Just positionally map the column names to given schema's.</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">return_schema</span><span class="o">.</span><span class="n">names</span><span class="p">)))</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">default_session</span><span class="p">()</span> |
| <span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"binaryFile"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">io</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"content"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span><span class="k">lambda</span> <span class="n">iterator</span><span class="p">:</span> <span class="nb">map</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">iterator</span><span class="p">),</span> <span class="n">schema</span><span class="o">=</span><span class="n">return_schema</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">{</span> |
| <span class="n">sn</span><span class="p">:</span> <span class="n">read_excel_on_spark</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">,</span> <span class="n">sn</span><span class="p">)</span> <span class="k">for</span> <span class="n">sn</span><span class="p">,</span> <span class="n">pdf_or_pser</span> <span class="ow">in</span> <span class="n">pdf_or_psers</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">}</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">read_excel_on_spark</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="n">sheet_name</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="read_html"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_html.html#pyspark.pandas.read_html">[docs]</a><span class="k">def</span> <span class="nf">read_html</span><span class="p">(</span> |
| <span class="n">io</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> |
| <span class="n">match</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">".+"</span><span class="p">,</span> |
| <span class="n">flavor</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">skiprows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="nb">slice</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">attrs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">parse_dates</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">thousands</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">","</span><span class="p">,</span> |
| <span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">decimal</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"."</span><span class="p">,</span> |
| <span class="n">converters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">na_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">keep_default_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">displayed_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Read HTML tables into a ``list`` of ``DataFrame`` objects.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> io : str or file-like</span> |
| <span class="sd"> A URL, a file-like object, or a raw string containing HTML. Note that</span> |
| <span class="sd"> lxml only accepts the HTTP, FTP and file URL protocols. If you have a</span> |
| <span class="sd"> URL that starts with ``'https'`` you might try removing the ``'s'``.</span> |
| |
| <span class="sd"> .. deprecated:: 4.0.0</span> |
| <span class="sd"> Passing html literal strings is deprecated.</span> |
| <span class="sd"> Wrap literal string/bytes input in io.StringIO/io.BytesIO instead.</span> |
| |
| <span class="sd"> match : str or compiled regular expression, optional</span> |
| <span class="sd"> The set of tables containing text matching this regex or string will be</span> |
| <span class="sd"> returned. Unless the HTML is extremely simple you will probably need to</span> |
| <span class="sd"> pass a non-empty string here. Defaults to '.+' (match any non-empty</span> |
| <span class="sd"> string). The default value will return all tables contained on a page.</span> |
| <span class="sd"> This value is converted to a regular expression so that there is</span> |
| <span class="sd"> consistent behavior between Beautiful Soup and lxml.</span> |
| |
| <span class="sd"> flavor : str or None, container of strings</span> |
| <span class="sd"> The parsing engine to use. 'bs4' and 'html5lib' are synonymous with</span> |
| <span class="sd"> each other, they are both there for backwards compatibility. The</span> |
| <span class="sd"> default of ``None`` tries to use ``lxml`` to parse and if that fails it</span> |
| <span class="sd"> falls back on ``bs4`` + ``html5lib``.</span> |
| |
| <span class="sd"> header : int or list-like or None, optional</span> |
| <span class="sd"> The row (or list of rows for a :class:`~ps.MultiIndex`) to use to</span> |
| <span class="sd"> make the columns headers.</span> |
| |
| <span class="sd"> index_col : int or list-like or None, optional</span> |
| <span class="sd"> The column (or list of columns) to use to create the index.</span> |
| |
| <span class="sd"> skiprows : int or list-like or slice or None, optional</span> |
| <span class="sd"> 0-based. Number of rows to skip after parsing the column integer. If a</span> |
| <span class="sd"> sequence of integers or a slice is given, will skip the rows indexed by</span> |
| <span class="sd"> that sequence. Note that a single element sequence means 'skip the nth</span> |
| <span class="sd"> row' whereas an integer means 'skip n rows'.</span> |
| |
| <span class="sd"> attrs : dict or None, optional</span> |
| <span class="sd"> This is a dictionary of attributes that you can pass to identify</span> |
| <span class="sd"> the table in the HTML. These are not checked for validity before being</span> |
| <span class="sd"> passed to lxml or Beautiful Soup. However, these attributes must be</span> |
| <span class="sd"> valid HTML table attributes to work correctly. For example, ::</span> |
| |
| <span class="sd"> attrs = {'id': 'table'}</span> |
| |
| <span class="sd"> is a valid attribute dictionary because the 'id' HTML tag attribute is</span> |
| <span class="sd"> a valid HTML attribute for *any* HTML tag as per `this document</span> |
| <span class="sd"> <http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::</span> |
| |
| <span class="sd"> attrs = {'asdf': 'table'}</span> |
| |
| <span class="sd"> is *not* a valid attribute dictionary because 'asdf' is not a valid</span> |
| <span class="sd"> HTML attribute even if it is a valid XML attribute. Valid HTML 4.01</span> |
| <span class="sd"> table attributes can be found `here</span> |
| <span class="sd"> <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A</span> |
| <span class="sd"> working draft of the HTML 5 spec can be found `here</span> |
| <span class="sd"> <http://www.w3.org/TR/html-markup/table.html>`__. It contains the</span> |
| <span class="sd"> latest information on table attributes for the modern web.</span> |
| |
| <span class="sd"> parse_dates : bool, optional</span> |
| <span class="sd"> See :func:`~ps.read_csv` for more details.</span> |
| |
| <span class="sd"> thousands : str, optional</span> |
| <span class="sd"> Separator to use to parse thousands. Defaults to ``','``.</span> |
| |
| <span class="sd"> encoding : str or None, optional</span> |
| <span class="sd"> The encoding used to decode the web page. Defaults to ``None``. ``None``</span> |
| <span class="sd"> preserves the previous encoding behavior, which depends on the</span> |
| <span class="sd"> underlying parser library (e.g., the parser library will try to use</span> |
| <span class="sd"> the encoding provided by the document).</span> |
| |
| <span class="sd"> decimal : str, default '.'</span> |
| <span class="sd"> Character to recognize as decimal point (example: use ',' for European</span> |
| <span class="sd"> data).</span> |
| |
| <span class="sd"> converters : dict, default None</span> |
| <span class="sd"> Dict of functions for converting values in certain columns. Keys can</span> |
| <span class="sd"> either be integers or column labels, values are functions that take one</span> |
| <span class="sd"> input argument, the cell (not column) content, and return the</span> |
| <span class="sd"> transformed content.</span> |
| |
| <span class="sd"> na_values : iterable, default None</span> |
| <span class="sd"> Custom NA values</span> |
| |
| <span class="sd"> keep_default_na : bool, default True</span> |
| <span class="sd"> If na_values are specified and keep_default_na is False the default NaN</span> |
| <span class="sd"> values are overridden, otherwise na_values are appended to them.</span> |
| |
| <span class="sd"> displayed_only : bool, default True</span> |
| <span class="sd"> Whether elements with "display: none" should be parsed</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dfs : list of DataFrames</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_csv</span> |
| <span class="sd"> DataFrame.to_html</span> |
| <span class="sd"> """</span> |
| <span class="n">pdfs</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_html</span><span class="p">(</span> |
| <span class="n">io</span><span class="o">=</span><span class="n">io</span><span class="p">,</span> |
| <span class="n">match</span><span class="o">=</span><span class="n">match</span><span class="p">,</span> |
| <span class="n">flavor</span><span class="o">=</span><span class="n">flavor</span><span class="p">,</span> |
| <span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> |
| <span class="n">skiprows</span><span class="o">=</span><span class="n">skiprows</span><span class="p">,</span> |
| <span class="n">attrs</span><span class="o">=</span><span class="n">attrs</span><span class="p">,</span> |
| <span class="n">parse_dates</span><span class="o">=</span><span class="n">parse_dates</span><span class="p">,</span> |
| <span class="n">thousands</span><span class="o">=</span><span class="n">thousands</span><span class="p">,</span> |
| <span class="n">encoding</span><span class="o">=</span><span class="n">encoding</span><span class="p">,</span> |
| <span class="n">decimal</span><span class="o">=</span><span class="n">decimal</span><span class="p">,</span> |
| <span class="n">converters</span><span class="o">=</span><span class="n">converters</span><span class="p">,</span> |
| <span class="n">na_values</span><span class="o">=</span><span class="n">na_values</span><span class="p">,</span> |
| <span class="n">keep_default_na</span><span class="o">=</span><span class="n">keep_default_na</span><span class="p">,</span> |
| <span class="n">displayed_only</span><span class="o">=</span><span class="n">displayed_only</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">],</span> <span class="p">[</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="k">for</span> <span class="n">pdf</span> <span class="ow">in</span> <span class="n">pdfs</span><span class="p">])</span></div> |
| |
| |
| <span class="c1"># TODO: add `coerce_float` and 'parse_dates' parameters</span> |
| <div class="viewcode-block" id="read_sql_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql_table.html#pyspark.pandas.read_sql_table">[docs]</a><span class="k">def</span> <span class="nf">read_sql_table</span><span class="p">(</span> |
| <span class="n">table_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Read SQL database table into a DataFrame.</span> |
| |
| <span class="sd"> Given a table name and a JDBC URI, returns a DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> table_name : str</span> |
| <span class="sd"> Name of SQL table in database.</span> |
| <span class="sd"> con : str</span> |
| <span class="sd"> A JDBC URI could be provided as str.</span> |
| |
| <span class="sd"> .. note:: The URI must be a JDBC URI instead of Python's database URI.</span> |
| |
| <span class="sd"> schema : str, default None</span> |
| <span class="sd"> Name of SQL schema in database to query (if database flavor</span> |
| <span class="sd"> supports this). Uses default schema if None (default).</span> |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Column(s) to set as index(MultiIndex).</span> |
| <span class="sd"> columns : list, default None</span> |
| <span class="sd"> List of column names to select from SQL table.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's JDBC data source.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A SQL table is returned as two-dimensional data structure with labeled</span> |
| <span class="sd"> axes.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_sql_query : Read SQL query into a DataFrame.</span> |
| <span class="sd"> read_sql : Read SQL query or database table into a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.read_sql_table('table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"dbtable"</span><span class="p">,</span> <span class="n">table_name</span><span class="p">)</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"url"</span><span class="p">,</span> <span class="n">con</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"jdbc"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">()</span> |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">columns</span><span class="p">]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">columns</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| |
| <span class="c1"># TODO: add `coerce_float`, `params`, and 'parse_dates' parameters</span> |
| <div class="viewcode-block" id="read_sql_query"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql_query.html#pyspark.pandas.read_sql_query">[docs]</a><span class="k">def</span> <span class="nf">read_sql_query</span><span class="p">(</span> |
| <span class="n">sql</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Read SQL query into a DataFrame.</span> |
| |
| <span class="sd"> Returns a DataFrame corresponding to the result set of the query</span> |
| <span class="sd"> string. Optionally provide an `index_col` parameter to use one of the</span> |
| <span class="sd"> columns as the index, otherwise default index will be used.</span> |
| |
| <span class="sd"> .. note:: Some databases might hit the Spark issue SPARK-27596.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sql : string SQL query</span> |
| <span class="sd"> SQL query to be executed.</span> |
| <span class="sd"> con : str</span> |
| <span class="sd"> A JDBC URI could be provided as str.</span> |
| |
| <span class="sd"> .. note:: The URI must be a JDBC URI instead of Python's database URI.</span> |
| |
| <span class="sd"> index_col : string or list of strings, optional, default: None</span> |
| <span class="sd"> Column(s) to set as index(MultiIndex).</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's JDBC data source.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_sql_table : Read SQL database table into a DataFrame.</span> |
| <span class="sd"> read_sql</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.read_sql_query('SELECT * FROM table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"query"</span><span class="p">,</span> <span class="n">sql</span><span class="p">)</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"url"</span><span class="p">,</span> <span class="n">con</span><span class="p">)</span> |
| <span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"jdbc"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">()</span> |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <span class="c1"># TODO: add `coerce_float`, `params`, and 'parse_dates' parameters</span> |
| <div class="viewcode-block" id="read_sql"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql.html#pyspark.pandas.read_sql">[docs]</a><span class="k">def</span> <span class="nf">read_sql</span><span class="p">(</span> |
| <span class="n">sql</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Read SQL query or database table into a DataFrame.</span> |
| |
| <span class="sd"> This function is a convenience wrapper around ``read_sql_table`` and</span> |
| <span class="sd"> ``read_sql_query`` (for backward compatibility). It will delegate</span> |
| <span class="sd"> to the specific function depending on the provided input. A SQL query</span> |
| <span class="sd"> will be routed to ``read_sql_query``, while a database table name will</span> |
| <span class="sd"> be routed to ``read_sql_table``. Note that the delegated function might</span> |
| <span class="sd"> have more specific notes about their functionality not listed here.</span> |
| |
| <span class="sd"> .. note:: Some databases might hit the Spark issue SPARK-27596.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sql : string</span> |
| <span class="sd"> SQL query to be executed or a table name.</span> |
| <span class="sd"> con : str</span> |
| <span class="sd"> A JDBC URI could be provided as str.</span> |
| |
| <span class="sd"> .. note:: The URI must be a JDBC URI instead of Python's database URI.</span> |
| |
| <span class="sd"> index_col : string or list of strings, optional, default: None</span> |
| <span class="sd"> Column(s) to set as index(MultiIndex).</span> |
| <span class="sd"> columns : list, default: None</span> |
| <span class="sd"> List of column names to select from SQL table (only used when reading</span> |
| <span class="sd"> a table).</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's JDBC data source.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_sql_table : Read SQL database table into a DataFrame.</span> |
| <span class="sd"> read_sql_query : Read SQL query into a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.read_sql('table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP</span> |
| <span class="sd"> >>> ps.read_sql('SELECT * FROM table_name', 'jdbc:postgresql:db_name') # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="n">striped</span> <span class="o">=</span> <span class="n">sql</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> |
| <span class="k">if</span> <span class="s2">" "</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">striped</span><span class="p">:</span> <span class="c1"># TODO: distinguish a table name from a SQL query more precisely.</span> |
| <span class="k">return</span> <span class="n">read_sql_table</span><span class="p">(</span><span class="n">sql</span><span class="p">,</span> <span class="n">con</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">read_sql_query</span><span class="p">(</span><span class="n">sql</span><span class="p">,</span> <span class="n">con</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_datetime"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_datetime.html#pyspark.pandas.to_datetime">[docs]</a><span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">to_datetime</span><span class="p">(</span> |
| <span class="n">arg</span><span class="p">,</span> |
| <span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"raise"</span><span class="p">,</span> |
| <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">unit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">infer_datetime_format</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">origin</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"unix"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert argument to datetime.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> arg : integer, float, string, datetime, list, tuple, 1-d array, Series</span> |
| <span class="sd"> or DataFrame/dict-like</span> |
| |
| <span class="sd"> errors : {'ignore', 'raise', 'coerce'}, default 'raise'</span> |
| |
| <span class="sd"> - If 'raise', then invalid parsing will raise an exception</span> |
| <span class="sd"> - If 'coerce', then invalid parsing will be set as NaT</span> |
| <span class="sd"> - If 'ignore', then invalid parsing will return the input</span> |
| <span class="sd"> format : string, default None</span> |
| <span class="sd"> strftime format used to parse time, e.g. "%d/%m/%Y". Note that "%f" will parse</span> |
| <span class="sd"> all the way up to nanoseconds.</span> |
| <span class="sd"> unit : string, default None</span> |
| <span class="sd"> unit of the arg (D,s,ms,us,ns) denote the unit, which is an</span> |
| <span class="sd"> integer or float number. This will be based off the origin.</span> |
| <span class="sd"> Example, with unit='ms' and origin='unix' (the default), this</span> |
| <span class="sd"> would calculate the number of milliseconds to the unix epoch start.</span> |
| <span class="sd"> infer_datetime_format : boolean, default False</span> |
| <span class="sd"> If True and no `format` is given, attempt to infer the format of the</span> |
| <span class="sd"> datetime strings, and if it can be inferred, switch to a faster</span> |
| <span class="sd"> method of parsing them. In some cases this can increase the parsing</span> |
| <span class="sd"> speed by ~5-10x.</span> |
| <span class="sd"> origin : scalar, default 'unix'</span> |
| <span class="sd"> Define the reference date. The numeric values would be parsed as number</span> |
| <span class="sd"> of units (defined by `unit`) since this reference date.</span> |
| |
| <span class="sd"> - If 'unix' (or POSIX) time; origin is set to 1970-01-01.</span> |
| <span class="sd"> - If 'julian', unit must be 'D', and origin is set to beginning of</span> |
| <span class="sd"> Julian Calendar. Julian day number 0 is assigned to the day starting</span> |
| <span class="sd"> at noon on January 1, 4713 BC.</span> |
| <span class="sd"> - If Timestamp convertible, origin is set to Timestamp identified by</span> |
| <span class="sd"> origin.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> ret : datetime if parsing succeeded.</span> |
| <span class="sd"> Return type depends on input:</span> |
| |
| <span class="sd"> - list-like: DatetimeIndex</span> |
| <span class="sd"> - Series: Series of datetime64 dtype</span> |
| <span class="sd"> - scalar: Timestamp</span> |
| |
| <span class="sd"> In case when it is not possible to return designated types (e.g. when</span> |
| <span class="sd"> any element of input is before Timestamp.min or after Timestamp.max)</span> |
| <span class="sd"> return will have datetime.datetime type (or corresponding</span> |
| <span class="sd"> array/Series).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Assembling a datetime from multiple columns of a DataFrame. The keys can be</span> |
| <span class="sd"> common abbreviations like ['year', 'month', 'day', 'minute', 'second',</span> |
| <span class="sd"> 'ms', 'us', 'ns'] or plurals of the same.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'year': [2015, 2016],</span> |
| <span class="sd"> ... 'month': [2, 3],</span> |
| <span class="sd"> ... 'day': [4, 5]})</span> |
| <span class="sd"> >>> ps.to_datetime(df)</span> |
| <span class="sd"> 0 2015-02-04</span> |
| <span class="sd"> 1 2016-03-05</span> |
| <span class="sd"> dtype: datetime64[ns]</span> |
| |
| <span class="sd"> If a date does not meet the `timestamp limitations</span> |
| <span class="sd"> <http://pandas.pydata.org/pandas-docs/stable/timeseries.html</span> |
| <span class="sd"> #timeseries-timestamp-limits>`_, passing errors='ignore'</span> |
| <span class="sd"> will return the original input instead of raising any exception.</span> |
| |
| <span class="sd"> Passing errors='coerce' will force an out-of-bounds date to NaT,</span> |
| <span class="sd"> in addition to forcing non-dates (or non-parseable dates) to NaT.</span> |
| |
| <span class="sd"> >>> ps.to_datetime('13000101', format='%Y%m%d', errors='ignore') # doctest: +SKIP</span> |
| <span class="sd"> datetime.datetime(1300, 1, 1, 0, 0)</span> |
| <span class="sd"> >>> ps.to_datetime('13000101', format='%Y%m%d', errors='coerce')</span> |
| <span class="sd"> NaT</span> |
| |
| <span class="sd"> Passing infer_datetime_format=True can often speed up parsing</span> |
| <span class="sd"> if the input is not exactly in ISO8601 format, but is in a regular format.</span> |
| |
| <span class="sd"> >>> s = ps.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000)</span> |
| <span class="sd"> >>> s.head()</span> |
| <span class="sd"> 0 3/11/2000</span> |
| <span class="sd"> 1 3/12/2000</span> |
| <span class="sd"> 2 3/13/2000</span> |
| <span class="sd"> 3 3/11/2000</span> |
| <span class="sd"> 4 3/12/2000</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> import timeit</span> |
| <span class="sd"> >>> timeit.timeit(</span> |
| <span class="sd"> ... lambda: repr(ps.to_datetime(s, infer_datetime_format=True)),</span> |
| <span class="sd"> ... number = 1) # doctest: +SKIP</span> |
| <span class="sd"> 0.35832712500000063</span> |
| |
| <span class="sd"> >>> timeit.timeit(</span> |
| <span class="sd"> ... lambda: repr(ps.to_datetime(s, infer_datetime_format=False)),</span> |
| <span class="sd"> ... number = 1) # doctest: +SKIP</span> |
| <span class="sd"> 0.8895321660000004</span> |
| |
| <span class="sd"> Using a unix epoch time</span> |
| |
| <span class="sd"> >>> ps.to_datetime(1490195805, unit='s')</span> |
| <span class="sd"> Timestamp('2017-03-22 15:16:45')</span> |
| <span class="sd"> >>> ps.to_datetime(1490195805433502912, unit='ns')</span> |
| <span class="sd"> Timestamp('2017-03-22 15:16:45.433502912')</span> |
| |
| <span class="sd"> Using a non-unix epoch origin</span> |
| |
| <span class="sd"> >>> ps.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))</span> |
| <span class="sd"> DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)</span> |
| <span class="sd"> """</span> |
| |
| <span class="c1"># mappings for assembling units</span> |
| <span class="c1"># From pandas: pandas.core.tools.datetimes</span> |
| <span class="n">_unit_map</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="s2">"year"</span><span class="p">:</span> <span class="s2">"year"</span><span class="p">,</span> |
| <span class="s2">"years"</span><span class="p">:</span> <span class="s2">"year"</span><span class="p">,</span> |
| <span class="s2">"month"</span><span class="p">:</span> <span class="s2">"month"</span><span class="p">,</span> |
| <span class="s2">"months"</span><span class="p">:</span> <span class="s2">"month"</span><span class="p">,</span> |
| <span class="s2">"day"</span><span class="p">:</span> <span class="s2">"day"</span><span class="p">,</span> |
| <span class="s2">"days"</span><span class="p">:</span> <span class="s2">"day"</span><span class="p">,</span> |
| <span class="s2">"hour"</span><span class="p">:</span> <span class="s2">"h"</span><span class="p">,</span> |
| <span class="s2">"hours"</span><span class="p">:</span> <span class="s2">"h"</span><span class="p">,</span> |
| <span class="s2">"minute"</span><span class="p">:</span> <span class="s2">"m"</span><span class="p">,</span> |
| <span class="s2">"minutes"</span><span class="p">:</span> <span class="s2">"m"</span><span class="p">,</span> |
| <span class="s2">"second"</span><span class="p">:</span> <span class="s2">"s"</span><span class="p">,</span> |
| <span class="s2">"seconds"</span><span class="p">:</span> <span class="s2">"s"</span><span class="p">,</span> |
| <span class="s2">"ms"</span><span class="p">:</span> <span class="s2">"ms"</span><span class="p">,</span> |
| <span class="s2">"millisecond"</span><span class="p">:</span> <span class="s2">"ms"</span><span class="p">,</span> |
| <span class="s2">"milliseconds"</span><span class="p">:</span> <span class="s2">"ms"</span><span class="p">,</span> |
| <span class="s2">"us"</span><span class="p">:</span> <span class="s2">"us"</span><span class="p">,</span> |
| <span class="s2">"microsecond"</span><span class="p">:</span> <span class="s2">"us"</span><span class="p">,</span> |
| <span class="s2">"microseconds"</span><span class="p">:</span> <span class="s2">"us"</span><span class="p">,</span> |
| <span class="p">}</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_to_datetime</span><span class="p">(</span> |
| <span class="n">pser_or_pdf</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="n">cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">datetime64</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pser_or_pdf</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">pser_or_pdf</span><span class="p">[</span><span class="n">cols</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span> |
| <span class="n">pser_or_pdf</span><span class="p">,</span> |
| <span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span> |
| <span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span> |
| <span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span> |
| <span class="n">infer_datetime_format</span><span class="o">=</span><span class="n">infer_datetime_format</span><span class="p">,</span> |
| <span class="n">origin</span><span class="o">=</span><span class="n">origin</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="n">pandas_to_datetime</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">unit</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">_unit_map</span><span class="p">[</span><span class="n">k</span><span class="o">.</span><span class="n">lower</span><span class="p">()]</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">arg</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span> <span class="k">if</span> <span class="n">k</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="ow">in</span> <span class="n">_unit_map</span><span class="p">}</span> |
| <span class="n">unit_rev</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">k</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">unit</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span> |
| <span class="n">list_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">unit_rev</span><span class="p">[</span><span class="s2">"year"</span><span class="p">],</span> <span class="n">unit_rev</span><span class="p">[</span><span class="s2">"month"</span><span class="p">],</span> <span class="n">unit_rev</span><span class="p">[</span><span class="s2">"day"</span><span class="p">]]</span> |
| <span class="k">for</span> <span class="n">u</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"h"</span><span class="p">,</span> <span class="s2">"m"</span><span class="p">,</span> <span class="s2">"s"</span><span class="p">,</span> <span class="s2">"ms"</span><span class="p">,</span> <span class="s2">"us"</span><span class="p">]:</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="n">unit_rev</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">u</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">arg</span><span class="p">:</span> |
| <span class="n">list_cols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">arg</span><span class="p">[</span><span class="n">list_cols</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">psdf</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="n">pandas_to_datetime</span><span class="p">,</span> <span class="n">list_cols</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span> |
| <span class="n">arg</span><span class="p">,</span> |
| <span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span> |
| <span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span> |
| <span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span> |
| <span class="n">infer_datetime_format</span><span class="o">=</span><span class="n">infer_datetime_format</span><span class="p">,</span> |
| <span class="n">origin</span><span class="o">=</span><span class="n">origin</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.date_range.html#pyspark.pandas.date_range">[docs]</a><span class="k">def</span> <span class="nf">date_range</span><span class="p">(</span> |
| <span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">end</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">freq</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">tz</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">tzinfo</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">normalize</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inclusive</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"both"</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DatetimeIndex</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return a fixed frequency DatetimeIndex.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : str or datetime-like, optional</span> |
| <span class="sd"> Left bound for generating dates.</span> |
| <span class="sd"> end : str or datetime-like, optional</span> |
| <span class="sd"> Right bound for generating dates.</span> |
| <span class="sd"> periods : int, optional</span> |
| <span class="sd"> Number of periods to generate.</span> |
| <span class="sd"> freq : str or DateOffset, default 'D'</span> |
| <span class="sd"> Frequency strings can have multiples, e.g. '5H'.</span> |
| <span class="sd"> tz : str or tzinfo, optional</span> |
| <span class="sd"> Time zone name for returning localized DatetimeIndex, for example</span> |
| <span class="sd"> 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is</span> |
| <span class="sd"> time zone naive.</span> |
| <span class="sd"> normalize : bool, default False</span> |
| <span class="sd"> Normalize start/end dates to midnight before generating date range.</span> |
| <span class="sd"> name : str, default None</span> |
| <span class="sd"> Name of the resulting DatetimeIndex.</span> |
| <span class="sd"> inclusive : {"both", "neither", "left", "right"}, default "both"</span> |
| <span class="sd"> Include boundaries; Whether to set each bound as closed or open.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0.0</span> |
| |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> For compatibility. Has no effect on the result.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> rng : DatetimeIndex</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DatetimeIndex : An immutable container for datetimes.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,</span> |
| <span class="sd"> exactly three must be specified. If ``freq`` is omitted, the resulting</span> |
| <span class="sd"> ``DatetimeIndex`` will have ``periods`` linearly spaced elements between</span> |
| <span class="sd"> ``start`` and ``end`` (closed on both sides).</span> |
| |
| <span class="sd"> To learn more about the frequency strings, please see `this link</span> |
| <span class="sd"> &lt;https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases&gt;`__.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> **Specifying the values**</span> |
| |
| <span class="sd"> The next four examples build a `DatetimeIndex` from different</span> |
| <span class="sd"> combinations of `start`, `end` and `periods`.</span> |
| |
| <span class="sd"> Specify `start` and `end`, with the default daily frequency.</span> |
| |
| <span class="sd"> >>> ps.date_range(start='1/1/2018', end='1/08/2018') # doctest: +SKIP</span> |
| <span class="sd"> DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',</span> |
| <span class="sd"> '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> Specify `start` and `periods`, the number of periods (days).</span> |
| |
| <span class="sd"> >>> ps.date_range(start='1/1/2018', periods=8) # doctest: +SKIP</span> |
| <span class="sd"> DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',</span> |
| <span class="sd"> '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> Specify `end` and `periods`, the number of periods (days).</span> |
| |
| <span class="sd"> >>> ps.date_range(end='1/1/2018', periods=8) # doctest: +SKIP</span> |
| <span class="sd"> DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',</span> |
| <span class="sd"> '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> Specify `start`, `end`, and `periods`; the frequency is generated</span> |
| <span class="sd"> automatically (linearly spaced).</span> |
| |
| <span class="sd"> >>> ps.date_range(</span> |
| <span class="sd"> ... start='2018-04-24', end='2018-04-27', periods=3</span> |
| <span class="sd"> ... ) # doctest: +SKIP</span> |
| <span class="sd"> DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',</span> |
| <span class="sd"> '2018-04-27 00:00:00'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> **Other Parameters**</span> |
| |
| <span class="sd"> Changed the `freq` (frequency) to ``'M'`` (month end frequency).</span> |
| |
| <span class="sd"> >>> ps.date_range(start='1/1/2018', periods=5, freq='M') # doctest: +SKIP</span> |
| <span class="sd"> DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',</span> |
| <span class="sd"> '2018-05-31'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> Multiples are allowed</span> |
| |
| <span class="sd"> >>> ps.date_range(start='1/1/2018', periods=5, freq='3M') # doctest: +SKIP</span> |
| <span class="sd"> DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',</span> |
| <span class="sd"> '2019-01-31'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> `freq` can also be specified as an Offset object.</span> |
| |
| <span class="sd"> >>> ps.date_range(</span> |
| <span class="sd"> ... start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)</span> |
| <span class="sd"> ... ) # doctest: +SKIP</span> |
| <span class="sd"> DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',</span> |
| <span class="sd"> '2019-01-31'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> `inclusive` controls whether to include `start` and `end` that are on the</span> |
| <span class="sd"> boundary. The default includes boundary points on either end.</span> |
| |
| <span class="sd"> >>> ps.date_range(</span> |
| <span class="sd"> ... start='2017-01-01', end='2017-01-04', inclusive="both"</span> |
| <span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],</span> |
| <span class="sd"> dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> Use ``inclusive='left'`` to exclude `end` if it falls on the boundary.</span> |
| |
| <span class="sd"> >>> ps.date_range(</span> |
| <span class="sd"> ... start='2017-01-01', end='2017-01-04', inclusive='left'</span> |
| <span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None)</span> |
| |
| <span class="sd"> Use ``inclusive='right'`` to exclude `start` if it falls on the boundary.</span> |
| |
| <span class="sd"> >>> ps.date_range(</span> |
| <span class="sd"> ... start='2017-01-01', end='2017-01-04', inclusive='right'</span> |
| <span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">freq</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"N"</span><span class="p">,</span> <span class="s2">"ns"</span><span class="p">],</span> <span class="s2">"nanoseconds is not supported"</span> |
| <span class="k">assert</span> <span class="n">tz</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">,</span> <span class="s2">"Localized DatetimeIndex is not supported"</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span> |
| <span class="n">DatetimeIndex</span><span class="p">,</span> |
| <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span> |
| <span class="n">pd</span><span class="o">.</span><span class="n">date_range</span><span class="p">(</span> |
| <span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span> |
| <span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span> |
| <span class="n">periods</span><span class="o">=</span><span class="n">periods</span><span class="p">,</span> |
| <span class="n">freq</span><span class="o">=</span><span class="n">freq</span><span class="p">,</span> |
| <span class="n">tz</span><span class="o">=</span><span class="n">tz</span><span class="p">,</span> |
| <span class="n">normalize</span><span class="o">=</span><span class="n">normalize</span><span class="p">,</span> |
| <span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> |
| <span class="n">inclusive</span><span class="o">=</span><span class="n">inclusive</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">),</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_timedelta"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_timedelta.html#pyspark.pandas.to_timedelta">[docs]</a><span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">to_timedelta</span><span class="p">(</span> |
| <span class="n">arg</span><span class="p">,</span> |
| <span class="n">unit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"raise"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert argument to timedelta.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> arg : str, timedelta, list-like or Series</span> |
| <span class="sd"> The data to be converted to timedelta.</span> |
| <span class="sd"> unit : str, optional</span> |
| <span class="sd"> Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``.</span> |
| |
| <span class="sd"> Possible values:</span> |
| <span class="sd"> * 'W'</span> |
| <span class="sd"> * 'D' / 'days' / 'day'</span> |
| <span class="sd"> * 'hours' / 'hour' / 'hr' / 'h'</span> |
| <span class="sd"> * 'm' / 'minute' / 'min' / 'minutes' / 'T'</span> |
| <span class="sd"> * 'S' / 'seconds' / 'sec' / 'second'</span> |
| <span class="sd"> * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L'</span> |
| <span class="sd"> * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U'</span> |
| <span class="sd"> * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N'</span> |
| |
| <span class="sd"> Must not be specified when `arg` contains strings and ``errors="raise"``.</span> |
| |
| <span class="sd"> .. deprecated:: 4.0.0</span> |
| <span class="sd"> Units 'T' and 'L' are deprecated and will be removed in a future version.</span> |
| |
| <span class="sd"> errors : {'ignore', 'raise', 'coerce'}, default 'raise'</span> |
| <span class="sd"> - If 'raise', then invalid parsing will raise an exception.</span> |
| <span class="sd"> - If 'coerce', then invalid parsing will be set as NaT.</span> |
| <span class="sd"> - If 'ignore', then invalid parsing will return the input.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> ret : timedelta64, TimedeltaIndex or Series of timedelta64 if parsing succeeded.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.astype : Cast argument to a specified dtype.</span> |
| <span class="sd"> to_datetime : Convert argument to datetime.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> If the precision is higher than nanoseconds, the precision of the duration is</span> |
| <span class="sd"> truncated to nanoseconds for string inputs.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Parsing a single string to a Timedelta:</span> |
| |
| <span class="sd"> >>> ps.to_timedelta('1 days 06:05:01.00003')</span> |
| <span class="sd"> Timedelta('1 days 06:05:01.000030')</span> |
| <span class="sd"> >>> ps.to_timedelta('15.5us') # doctest: +SKIP</span> |
| <span class="sd"> Timedelta('0 days 00:00:00.000015500')</span> |
| |
| <span class="sd"> Parsing a list or array of strings:</span> |
| |
| <span class="sd"> >>> ps.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) # doctest: +SKIP</span> |
| <span class="sd"> TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT],</span> |
| <span class="sd"> dtype='timedelta64[ns]', freq=None)</span> |
| |
| <span class="sd"> Converting numbers by specifying the `unit` keyword argument:</span> |
| |
| <span class="sd"> >>> ps.to_timedelta(np.arange(5), unit='s') # doctest: +SKIP</span> |
| <span class="sd"> TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02',</span> |
| <span class="sd"> '0 days 00:00:03', '0 days 00:00:04'],</span> |
| <span class="sd"> dtype='timedelta64[ns]', freq=None)</span> |
| <span class="sd"> >>> ps.to_timedelta(np.arange(5), unit='d') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],</span> |
| <span class="sd"> dtype='timedelta64[ns]', freq=None)</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_to_timedelta</span><span class="p">(</span><span class="n">pser</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">timedelta64</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_timedelta</span><span class="p">(</span> |
| <span class="n">arg</span><span class="o">=</span><span class="n">pser</span><span class="p">,</span> |
| <span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span> |
| <span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">pandas_to_timedelta</span><span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_timedelta</span><span class="p">(</span> |
| <span class="n">arg</span><span class="o">=</span><span class="n">arg</span><span class="p">,</span> |
| <span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span> |
| <span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="timedelta_range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.timedelta_range.html#pyspark.pandas.timedelta_range">[docs]</a><span class="k">def</span> <span class="nf">timedelta_range</span><span class="p">(</span> |
| <span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">end</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">freq</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">closed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">TimedeltaIndex</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return a fixed frequency TimedeltaIndex, with day as the default frequency.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : str or timedelta-like, optional</span> |
| <span class="sd"> Left bound for generating timedeltas.</span> |
| <span class="sd"> end : str or timedelta-like, optional</span> |
| <span class="sd"> Right bound for generating timedeltas.</span> |
| <span class="sd"> periods : int, optional</span> |
| <span class="sd"> Number of periods to generate.</span> |
| <span class="sd"> freq : str or DateOffset, default 'D'</span> |
| <span class="sd"> Frequency strings can have multiples, e.g. '5H'.</span> |
| <span class="sd"> name : str, default None</span> |
| <span class="sd"> Name of the resulting TimedeltaIndex.</span> |
| <span class="sd"> closed : {None, 'left', 'right'}, optional</span> |
| <span class="sd"> Make the interval closed with respect to the given frequency to</span> |
| <span class="sd"> the 'left', 'right', or both sides (None, the default).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> TimedeltaIndex</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,</span> |
| <span class="sd"> exactly three must be specified. If ``freq`` is omitted, the resulting</span> |
| <span class="sd"> ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between</span> |
| <span class="sd"> ``start`` and ``end`` (closed on both sides).</span> |
| |
| <span class="sd"> To learn more about the frequency strings, please see `this link</span> |
| <span class="sd"> &lt;https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases&gt;`__.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.timedelta_range(start='1 day', periods=4) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None)</span> |
| |
| <span class="sd"> The closed parameter specifies which endpoint is included.</span> |
| <span class="sd"> The default behavior is to include both endpoints.</span> |
| |
| <span class="sd"> >>> ps.timedelta_range(start='1 day', periods=4, closed='right')</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> TimedeltaIndex(['2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None)</span> |
| |
| <span class="sd"> The freq parameter specifies the frequency of the TimedeltaIndex.</span> |
| <span class="sd"> Only fixed frequencies can be passed, non-fixed frequencies such as ‘M’ (month end) will raise.</span> |
| |
| <span class="sd"> >>> ps.timedelta_range(start='1 day', end='2 days', freq='6H')</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00',</span> |
| <span class="sd"> '1 days 18:00:00', '2 days 00:00:00'],</span> |
| <span class="sd"> dtype='timedelta64[ns]', freq=None)</span> |
| |
| <span class="sd"> Specify start, end, and periods; the frequency is generated automatically (linearly spaced).</span> |
| |
| <span class="sd"> >>> ps.timedelta_range(start='1 day', end='5 days', periods=4)</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00',</span> |
| <span class="sd"> '5 days 00:00:00'],</span> |
| <span class="sd"> dtype='timedelta64[ns]', freq=None)</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">freq</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"N"</span><span class="p">,</span> <span class="s2">"ns"</span><span class="p">],</span> <span class="s2">"nanoseconds is not supported"</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span> |
| <span class="n">TimedeltaIndex</span><span class="p">,</span> |
| <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span> |
| <span class="n">pd</span><span class="o">.</span><span class="n">timedelta_range</span><span class="p">(</span> |
| <span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span> |
| <span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span> |
| <span class="n">periods</span><span class="o">=</span><span class="n">periods</span><span class="p">,</span> |
| <span class="n">freq</span><span class="o">=</span><span class="n">freq</span><span class="p">,</span> |
| <span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> |
| <span class="n">closed</span><span class="o">=</span><span class="n">closed</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">),</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="get_dummies"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.get_dummies.html#pyspark.pandas.get_dummies">[docs]</a><span class="k">def</span> <span class="nf">get_dummies</span><span class="p">(</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">prefix</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">prefix_sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"_"</span><span class="p">,</span> |
| <span class="n">dummy_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">sparse</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">drop_first</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert categorical variable into dummy/indicator variables, also</span> |
| <span class="sd"> known as one hot encoding.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : array-like, Series, or DataFrame</span> |
| <span class="sd"> prefix : string, list of strings, or dict of strings, default None</span> |
| <span class="sd"> String to prepend to DataFrame column names.</span> |
| <span class="sd"> Pass a list with length equal to the number of columns</span> |
| <span class="sd"> when calling get_dummies on a DataFrame. Alternatively, `prefix`</span> |
| <span class="sd"> can be a dictionary mapping column names to prefixes.</span> |
| <span class="sd"> prefix_sep : string, default '_'</span> |
| <span class="sd"> If appending prefix, separator/delimiter to use. Or pass a</span> |
| <span class="sd"> list or dictionary as with `prefix`.</span> |
| <span class="sd"> dummy_na : bool, default False</span> |
| <span class="sd"> Add a column to indicate NaNs; if False, NaNs are ignored.</span> |
| <span class="sd"> columns : list-like, default None</span> |
| <span class="sd"> Column names in the DataFrame to be encoded.</span> |
| <span class="sd"> If `columns` is None then all the columns with</span> |
| <span class="sd"> `object` or `category` dtype will be converted.</span> |
| <span class="sd"> sparse : bool, default False</span> |
| <span class="sd"> Whether the dummy-encoded columns should be backed by</span> |
| <span class="sd"> a :class:`SparseArray` (True) or a regular NumPy array (False).</span> |
| <span class="sd"> In pandas-on-Spark, this value must be "False".</span> |
| <span class="sd"> drop_first : bool, default False</span> |
| <span class="sd"> Whether to get k-1 dummies out of k categorical levels by removing the</span> |
| <span class="sd"> first level.</span> |
| <span class="sd"> dtype : dtype, default np.int8</span> |
| <span class="sd"> Data type for new columns. Only a single dtype is allowed.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dummies : DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.str.get_dummies</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(list('abca'))</span> |
| |
| <span class="sd"> >>> ps.get_dummies(s)</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 0 0</span> |
| <span class="sd"> 1 0 1 0</span> |
| <span class="sd"> 2 0 0 1</span> |
| <span class="sd"> 3 1 0 0</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],</span> |
| <span class="sd"> ... 'C': [1, 2, 3]},</span> |
| <span class="sd"> ... columns=['A', 'B', 'C'])</span> |
| |
| <span class="sd"> >>> ps.get_dummies(df, prefix=['col1', 'col2'])</span> |
| <span class="sd"> C col1_a col1_b col2_a col2_b col2_c</span> |
| <span class="sd"> 0 1 1 0 0 1 0</span> |
| <span class="sd"> 1 2 0 1 1 0 0</span> |
| <span class="sd"> 2 3 1 0 0 0 1</span> |
| |
| <span class="sd"> >>> ps.get_dummies(ps.Series(list('abcaa')))</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 0 0</span> |
| <span class="sd"> 1 0 1 0</span> |
| <span class="sd"> 2 0 0 1</span> |
| <span class="sd"> 3 1 0 0</span> |
| <span class="sd"> 4 1 0 0</span> |
| |
| <span class="sd"> >>> ps.get_dummies(ps.Series(list('abcaa')), drop_first=True)</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> 0 0 0</span> |
| <span class="sd"> 1 1 0</span> |
| <span class="sd"> 2 0 1</span> |
| <span class="sd"> 3 0 0</span> |
| <span class="sd"> 4 0 0</span> |
| |
| <span class="sd"> >>> ps.get_dummies(ps.Series(list('abc')), dtype=float)</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1.0 0.0 0.0</span> |
| <span class="sd"> 1 0.0 1.0 0.0</span> |
| <span class="sd"> 2 0.0 0.0 1.0</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">sparse</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">False</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"get_dummies currently does not support sparse"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Input must be a list-like for parameter `columns`"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="s2">"byte"</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">prefix</span><span class="p">)]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="n">remaining_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"get_dummies currently does not support prefix as string types"</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">_get_dummies_default_accept_types</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)]</span> <span class="o">==</span> <span class="n">columns</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="p">:])</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span> |
| <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span> |
| <span class="k">else</span> <span class="s2">""</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="p">]</span> |
| <span class="k">elif</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Expected tuple, got </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="nb">type</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">))</span><span class="o">.</span><span class="n">pop</span><span class="p">())</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">columns</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="o">==</span> <span class="n">key</span> <span class="ow">or</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">key</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"</span><span class="si">{}</span><span class="s2"> not in index"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span> |
| |
| <span class="n">column_labels_set</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="n">remaining_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="p">(</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="k">else</span> <span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels_set</span> |
| <span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">_get_dummies_acceptable_types</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"get_dummies currently only accept </span><span class="si">{}</span><span class="s2"> values"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Type</span><span class="p">[</span><span class="n">DataType</span><span class="p">],</span> <span class="n">t</span><span class="p">)</span><span class="o">.</span><span class="n">typeName</span><span class="p">()</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">_get_dummies_acceptable_types</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">prefix</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Length of 'prefix' (</span><span class="si">{}</span><span class="s2">) did not match the length of "</span> |
| <span class="s2">"the columns being encoded (</span><span class="si">{}</span><span class="s2">)."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">prefix</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="n">prefix</span><span class="p">[</span><span class="n">column_label</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span> <span class="k">for</span> <span class="n">column_label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span> |
| |
| <span class="n">all_values</span> <span class="o">=</span> <span class="n">_reduce_spark_multi</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">collect_set</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="p">):</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="n">all_values</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">):</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="n">values</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">drop_first</span><span class="p">:</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span> |
| |
| <span class="k">def</span> <span class="nf">column_name</span><span class="p">(</span><span class="n">v</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Name</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">prefix</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="s2">""</span><span class="p">:</span> <span class="c1"># type: ignore[index]</span> |
| <span class="k">return</span> <span class="n">v</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="s2">"</span><span class="si">{}{}{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">prefix</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">prefix_sep</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span> <span class="c1"># type: ignore[index]</span> |
| |
| <span class="k">for</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">values</span><span class="p">:</span> |
| <span class="n">remaining_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">notnull</span><span class="p">()</span> <span class="o">&</span> <span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="o">==</span> <span class="n">value</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">column_name</span><span class="p">(</span><span class="n">value</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">dummy_na</span><span class="p">:</span> |
| <span class="n">remaining_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">column_name</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">)))</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span><span class="p">[</span><span class="n">remaining_columns</span><span class="p">]</span></div> |
| |
| |
| <span class="c1"># TODO: there are many parameters to implement and support. See pandas's pd.concat.</span> |
| <div class="viewcode-block" id="concat"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.concat.html#pyspark.pandas.concat">[docs]</a><span class="k">def</span> <span class="nf">concat</span><span class="p">(</span> |
| <span class="n">objs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]],</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">join</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"outer"</span><span class="p">,</span> |
| <span class="n">ignore_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">sort</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Concatenate pandas-on-Spark objects along a particular axis with optional set logic</span> |
| <span class="sd"> along the other axes.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> objs : a sequence of Series or DataFrame</span> |
| <span class="sd"> Any None objects will be dropped silently unless</span> |
| <span class="sd"> they are all None, in which case a ValueError will be raised.</span> |
| <span class="sd"> axis : {0/'index', 1/'columns'}, default 0</span> |
| <span class="sd"> The axis to concatenate along.</span> |
| <span class="sd"> join : {'inner', 'outer'}, default 'outer'</span> |
| <span class="sd"> How to handle indexes on other axis (or axes).</span> |
| <span class="sd"> ignore_index : bool, default False</span> |
| <span class="sd"> If True, do not use the index values along the concatenation axis. The</span> |
| <span class="sd"> resulting axis will be labeled 0, ..., n - 1. This is useful if you are</span> |
| <span class="sd"> concatenating objects where the concatenation axis does not have</span> |
| <span class="sd"> meaningful indexing information. Note the index values on the other</span> |
| <span class="sd"> axes are still respected in the join.</span> |
| <span class="sd"> sort : bool, default False</span> |
| <span class="sd"> Sort non-concatenation axis if it is not already aligned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> object, type of objs</span> |
| <span class="sd"> When concatenating all ``Series`` along the index (axis=0), a</span> |
| <span class="sd"> ``Series`` is returned. When ``objs`` contains at least one</span> |
| <span class="sd"> ``DataFrame``, a ``DataFrame`` is returned. When concatenating along</span> |
| <span class="sd"> the columns (axis=1), a ``DataFrame`` is returned.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.join : Join DataFrames using indexes.</span> |
| <span class="sd"> DataFrame.merge : Merge DataFrames by indexes or columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| |
| <span class="sd"> Combine two ``Series``.</span> |
| |
| <span class="sd"> >>> s1 = ps.Series(['a', 'b'])</span> |
| <span class="sd"> >>> s2 = ps.Series(['c', 'd'])</span> |
| <span class="sd"> >>> ps.concat([s1, s2])</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 0 c</span> |
| <span class="sd"> 1 d</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> Clear the existing index and reset it in the result</span> |
| <span class="sd"> by setting the ``ignore_index`` option to ``True``.</span> |
| |
| <span class="sd"> >>> ps.concat([s1, s2], ignore_index=True)</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| <span class="sd"> 3 d</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> Combine two ``DataFrame`` objects with identical columns.</span> |
| |
| <span class="sd"> >>> df1 = ps.DataFrame([['a', 1], ['b', 2]],</span> |
| <span class="sd"> ... columns=['letter', 'number'])</span> |
| <span class="sd"> >>> df1</span> |
| <span class="sd"> letter number</span> |
| <span class="sd"> 0 a 1</span> |
| <span class="sd"> 1 b 2</span> |
| <span class="sd"> >>> df2 = ps.DataFrame([['c', 3], ['d', 4]],</span> |
| <span class="sd"> ... columns=['letter', 'number'])</span> |
| <span class="sd"> >>> df2</span> |
| <span class="sd"> letter number</span> |
| <span class="sd"> 0 c 3</span> |
| <span class="sd"> 1 d 4</span> |
| |
| <span class="sd"> >>> ps.concat([df1, df2])</span> |
| <span class="sd"> letter number</span> |
| <span class="sd"> 0 a 1</span> |
| <span class="sd"> 1 b 2</span> |
| <span class="sd"> 0 c 3</span> |
| <span class="sd"> 1 d 4</span> |
| |
| <span class="sd"> Combine ``DataFrame`` and ``Series`` objects with different columns.</span> |
| |
| <span class="sd"> >>> ps.concat([df2, s1])</span> |
| <span class="sd"> letter number 0</span> |
| <span class="sd"> 0 c 3.0 None</span> |
| <span class="sd"> 1 d 4.0 None</span> |
| <span class="sd"> 0 None NaN a</span> |
| <span class="sd"> 1 None NaN b</span> |
| |
| <span class="sd"> Combine ``DataFrame`` objects with overlapping columns</span> |
| <span class="sd"> and return everything. Columns outside the intersection will</span> |
| <span class="sd"> be filled with ``None`` values.</span> |
| |
| <span class="sd"> >>> df3 = ps.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],</span> |
| <span class="sd"> ... columns=['letter', 'number', 'animal'])</span> |
| <span class="sd"> >>> df3</span> |
| <span class="sd"> letter number animal</span> |
| <span class="sd"> 0 c 3 cat</span> |
| <span class="sd"> 1 d 4 dog</span> |
| |
| <span class="sd"> >>> ps.concat([df1, df3])</span> |
| <span class="sd"> letter number animal</span> |
| <span class="sd"> 0 a 1 None</span> |
| <span class="sd"> 1 b 2 None</span> |
| <span class="sd"> 0 c 3 cat</span> |
| <span class="sd"> 1 d 4 dog</span> |
| |
| <span class="sd"> Sort the columns.</span> |
| |
| <span class="sd"> >>> ps.concat([df1, df3], sort=True)</span> |
| <span class="sd"> animal letter number</span> |
| <span class="sd"> 0 None a 1</span> |
| <span class="sd"> 1 None b 2</span> |
| <span class="sd"> 0 cat c 3</span> |
| <span class="sd"> 1 dog d 4</span> |
| |
| <span class="sd"> Combine ``DataFrame`` objects with overlapping columns</span> |
| <span class="sd"> and return only those that are shared by passing ``inner`` to</span> |
| <span class="sd"> the ``join`` keyword argument.</span> |
| |
| <span class="sd"> >>> ps.concat([df1, df3], join="inner")</span> |
| <span class="sd"> letter number</span> |
| <span class="sd"> 0 a 1</span> |
| <span class="sd"> 1 b 2</span> |
| <span class="sd"> 0 c 3</span> |
| <span class="sd"> 1 d 4</span> |
| |
| <span class="sd"> >>> df4 = ps.DataFrame([['bird', 'polly'], ['monkey', 'george']],</span> |
| <span class="sd"> ... columns=['animal', 'name'])</span> |
| |
| <span class="sd"> Combine with column axis.</span> |
| |
| <span class="sd"> >>> ps.concat([df1, df4], axis=1)</span> |
| <span class="sd"> letter number animal name</span> |
| <span class="sd"> 0 a 1 bird polly</span> |
| <span class="sd"> 1 b 2 monkey george</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">objs</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">))</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="n">objs</span><span class="p">,</span> <span class="n">Iterable</span> |
| <span class="p">):</span> <span class="c1"># TODO: support dict</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"first argument must be an iterable of pandas-on-Spark "</span> |
| <span class="s2">"objects, you passed an object of type "</span> |
| <span class="s1">'"</span><span class="si">{name}</span><span class="s1">"'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Sized</span><span class="p">,</span> <span class="n">objs</span><span class="p">))</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"No objects to concatenate"</span><span class="p">)</span> |
| <span class="n">objs</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">obj</span><span class="p">:</span> <span class="n">obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="n">objs</span><span class="p">))</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"All objects passed were None"</span><span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"cannot concatenate object of type "</span> |
| <span class="s2">"'</span><span class="si">{name}</span><span class="s2">"</span> |
| <span class="s2">"; only ps.Series "</span> |
| <span class="s2">"and ps.DataFrame are valid"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">join</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"inner"</span><span class="p">,</span> <span class="s2">"outer"</span><span class="p">]:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Only can inner (intersect) or outer (union) join the other axis."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">all</span><span class="p">([</span><span class="n">obj</span><span class="o">.</span><span class="n">empty</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">]):</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"The behavior of array concatenation with empty entries is "</span> |
| <span class="s2">"deprecated. In a future version, this will no longer exclude "</span> |
| <span class="s2">"empty items when determining the result dtype. "</span> |
| <span class="s2">"To retain the old behavior, exclude the empty entries before "</span> |
| <span class="s2">"the concat operation."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">psdfs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="k">else</span> <span class="n">obj</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span> |
| <span class="p">]</span> |
| |
| <span class="n">level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">)</span> |
| <span class="n">psdfs</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">DataFrame</span><span class="o">.</span><span class="n">_index_normalized_frame</span><span class="p">(</span><span class="n">level</span><span class="p">,</span> <span class="n">psdf</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">></span> <span class="n">level</span> |
| <span class="k">else</span> <span class="n">psdf</span> |
| <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span> |
| <span class="p">]</span> |
| |
| <span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="n">psdfs_not_same_anchor</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">1</span><span class="p">:]:</span> |
| <span class="n">duplicated</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">duplicated</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">pretty_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicated</span><span class="p">]</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Labels have to be unique; however, got duplicated labels </span><span class="si">%s</span><span class="s2">."</span> <span class="o">%</span> <span class="n">pretty_names</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">concat_psdf</span><span class="p">,</span> <span class="n">psdf</span><span class="p">):</span> |
| <span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psdfs_not_same_anchor</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdfs_not_same_anchor</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| |
| <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">resolve_func</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">,</span> <span class="n">that_column_labels</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">AssertionError</span><span class="p">(</span><span class="s2">"This should not happen."</span><span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs_not_same_anchor</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">"inner"</span><span class="p">:</span> |
| <span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span> |
| <span class="n">resolve_func</span><span class="p">,</span> |
| <span class="n">concat_psdf</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="n">how</span><span class="o">=</span><span class="s2">"inner"</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">"outer"</span><span class="p">:</span> |
| <span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span> |
| <span class="n">resolve_func</span><span class="p">,</span> |
| <span class="n">concat_psdf</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="n">how</span><span class="o">=</span><span class="s2">"full"</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="p">[</span><span class="n">column_labels</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span> |
| <span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="nb">map</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">_range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)))</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">sort</span><span class="p">:</span> |
| <span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span> |
| |
| <span class="n">columns</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">([</span><span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="n">columns</span><span class="o">.</span><span class="n">nlevels</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="s2">"columns"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">concat_psdf</span> |
| |
| <span class="c1"># Series, Series ...</span> |
| <span class="c1"># We should return Series if objects are all Series.</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="nb">all</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">obj</span><span class="p">:</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">),</span> <span class="n">objs</span><span class="p">))</span> |
| |
| <span class="c1"># DataFrame, Series ... & Series, Series ...</span> |
| <span class="c1"># In this case, we should return DataFrame.</span> |
| <span class="n">new_objs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">num_series</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="n">series_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">num_series</span> <span class="o">+=</span> <span class="mi">1</span> |
| <span class="n">series_names</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">ignore_index</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">should_return_series</span><span class="p">:</span> |
| <span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(</span><span class="n">DEFAULT_SERIES_NAME</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span> |
| <span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span> |
| |
| <span class="n">column_labels_levels</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels_levels</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"MultiIndex columns should have the same levels"</span><span class="p">)</span> |
| |
| <span class="c1"># DataFrame, DataFrame, ...</span> |
| <span class="c1"># All Series are converted into DataFrame and then compute concat.</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">ignore_index</span><span class="p">:</span> |
| <span class="n">indices_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">index</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span> |
| <span class="n">index_of_first_psdf</span> <span class="o">=</span> <span class="n">indices_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">index_of_psdf</span> <span class="ow">in</span> <span class="n">indices_of_psdfs</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">index_of_first_psdf</span><span class="o">.</span><span class="n">names</span> <span class="o">!=</span> <span class="n">index_of_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Index type and names should be same in the objects to concatenate. "</span> |
| <span class="s2">"You passed different indices "</span> |
| <span class="s2">"</span><span class="si">{index_of_first_psdf}</span><span class="s2"> and </span><span class="si">{index_of_psdf}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">index_of_first_psdf</span><span class="o">=</span><span class="n">index_of_first_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">,</span> |
| <span class="n">index_of_psdf</span><span class="o">=</span><span class="n">index_of_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">column_labels_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span> |
| <span class="n">index_names_of_psdfs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Label</span><span class="p">]]]</span> |
| <span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span> |
| <span class="n">index_names_of_psdfs</span> <span class="o">=</span> <span class="p">[[]</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_names_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="n">name</span> <span class="o">==</span> <span class="n">index_names_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">index_names_of_psdfs</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span> |
| <span class="n">idx</span> <span class="o">==</span> <span class="n">column_labels_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">idx</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span> |
| <span class="p">):</span> |
| <span class="c1"># If all frames share the same index names and the same column labels (in the same order), use them as-is.</span> |
| <span class="n">psdfs</span> <span class="o">=</span> <span class="n">new_objs</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">"inner"</span><span class="p">:</span> |
| <span class="n">interested_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="o">*</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="nb">set</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">column_labels_of_psdfs</span><span class="p">))</span> |
| <span class="c1"># Keep the column order of the first DataFrame.</span> |
| <span class="n">merged_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">interested_columns</span> |
| <span class="p">]</span> |
| |
| <span class="c1"># If sort is True, sort to follow pandas 1.4+ behavior.</span> |
| <span class="k">if</span> <span class="n">sort</span><span class="p">:</span> |
| <span class="c1"># FIXME: better ordering</span> |
| <span class="n">merged_columns</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">name_like_string</span><span class="p">)</span> |
| |
| <span class="n">psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="p">[</span><span class="n">merged_columns</span><span class="p">]</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">"outer"</span><span class="p">:</span> |
| <span class="n">merged_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">labels</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span><span class="p">:</span> |
| <span class="n">merged_columns</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">merged_columns</span><span class="p">)</span> |
| |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> |
| |
| <span class="c1"># If sort is True, always sort</span> |
| <span class="k">if</span> <span class="n">sort</span><span class="p">:</span> |
| <span class="c1"># FIXME: better ordering</span> |
| <span class="n">merged_columns</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">name_like_string</span><span class="p">)</span> |
| |
| <span class="n">psdfs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">:</span> |
| <span class="n">columns_to_add</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">)</span> <span class="o">-</span> <span class="nb">set</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span> |
| |
| <span class="c1"># TODO: Handle the NaN vs. None difference for missing values; pandas appears to fill with NaN.</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">columns_to_add</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> |
| |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> <span class="o">+</span> <span class="p">[</span> |
| <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">columns_to_add</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="o">+</span> <span class="n">columns_to_add</span><span class="p">),</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> <span class="o">+</span> <span class="p">([</span><span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns_to_add</span><span class="p">))),</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdfs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">merged_columns</span><span class="p">])</span> |
| |
| <span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span> |
| <span class="n">sdfs</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdfs</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span> |
| <span class="p">]</span> |
| <span class="n">concatenated</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">y</span><span class="p">),</span> <span class="n">sdfs</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> |
| |
| <span class="n">result_psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">concatenated</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">concatenated</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">concatenated</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="c1"># TODO: dtypes?</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span> |
| <span class="c1"># If all inputs were Series, we should return a Series.</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">series_names</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">series_names</span><span class="o">.</span><span class="n">pop</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">result_psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">result_psdf</span></div> |
| |
| |
| <div class="viewcode-block" id="melt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.melt.html#pyspark.pandas.melt">[docs]</a><span class="k">def</span> <span class="nf">melt</span><span class="p">(</span> |
| <span class="n">frame</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">id_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">value_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">var_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">value_name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"value"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">melt</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="n">id_vars</span><span class="p">,</span> <span class="n">value_vars</span><span class="p">,</span> <span class="n">var_name</span><span class="p">,</span> <span class="n">value_name</span><span class="p">)</span></div> |
| |
| |
| <span class="n">melt</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">melt</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| |
| <div class="viewcode-block" id="isna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.isna.html#pyspark.pandas.isna">[docs]</a><span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">isna</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Detect missing values for an array-like object.</span> |
| |
| <span class="sd"> This function takes a scalar or array-like object and indicates</span> |
| <span class="sd"> whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``</span> |
| <span class="sd"> in object arrays).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> obj : scalar or array-like</span> |
| <span class="sd"> Object to check for null or missing values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool or array-like of bool</span> |
| <span class="sd"> For scalar input, returns a scalar boolean.</span> |
| <span class="sd"> For array input, returns an array of booleans indicating whether each</span> |
| <span class="sd"> corresponding element is missing.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.isna : Detect missing values in a Series.</span> |
| <span class="sd"> Series.isnull : Detect missing values in a Series.</span> |
| <span class="sd"> DataFrame.isna : Detect missing values in a DataFrame.</span> |
| <span class="sd"> DataFrame.isnull : Detect missing values in a DataFrame.</span> |
| <span class="sd"> Index.isna : Detect missing values in an Index.</span> |
| <span class="sd"> Index.isnull : Detect missing values in an Index.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Scalar arguments (including strings) result in a scalar boolean.</span> |
| |
| <span class="sd"> >>> ps.isna('dog')</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> >>> ps.isna(np.nan)</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> ndarrays result in an ndarray of booleans.</span> |
| |
| <span class="sd"> >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])</span> |
| <span class="sd"> >>> array</span> |
| <span class="sd"> array([[ 1., nan, 3.],</span> |
| <span class="sd"> [ 4., 5., nan]])</span> |
| <span class="sd"> >>> ps.isna(array)</span> |
| <span class="sd"> array([[False, True, False],</span> |
| <span class="sd"> [False, False, True]])</span> |
| |
| <span class="sd"> For Series and DataFrame, the same type is returned, containing booleans.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': ['ant', 'bee', 'cat'], 'b': ['dog', None, 'fly']})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 ant dog</span> |
| <span class="sd"> 1 bee None</span> |
| <span class="sd"> 2 cat fly</span> |
| |
| <span class="sd"> >>> ps.isna(df)</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 False False</span> |
| <span class="sd"> 1 False True</span> |
| <span class="sd"> 2 False False</span> |
| |
| <span class="sd"> >>> ps.isnull(df.b)</span> |
| <span class="sd"> 0 False</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 False</span> |
| <span class="sd"> Name: b, dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="c1"># TODO: Add back:</span> |
| <span class="c1"># notnull : Boolean inverse of pandas.isnull.</span> |
| <span class="c1"># into the See Also in the docstring. It does not find the method in the latest numpydoc.</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span> |
| <span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span></div> |
| |
| |
| <span class="n">isnull</span> <span class="o">=</span> <span class="n">isna</span> |
| |
| |
| <div class="viewcode-block" id="notna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.notna.html#pyspark.pandas.notna">[docs]</a><span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">notna</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Detect existing (non-missing) values.</span> |
| |
| <span class="sd"> Return a boolean same-sized object indicating if the values are not NA.</span> |
| <span class="sd"> Non-missing values get mapped to True. NA values, such as None or</span> |
| <span class="sd"> :attr:`numpy.NaN`, get mapped to False values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool or array-like of bool</span> |
| <span class="sd"> Mask of bool values for each element that</span> |
| <span class="sd"> indicates whether an element is not an NA value.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> isna : Detect missing values for an array-like object.</span> |
| <span class="sd"> Series.notna : Boolean inverse of Series.isna.</span> |
| <span class="sd"> DataFrame.notnull : Boolean inverse of DataFrame.isnull.</span> |
| <span class="sd"> Index.notna : Boolean inverse of Index.isna.</span> |
| <span class="sd"> Index.notnull : Boolean inverse of Index.isnull.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Show which entries in a DataFrame are not NA.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'age': [5, 6, np.NaN],</span> |
| <span class="sd"> ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'),</span> |
| <span class="sd"> ... pd.Timestamp('1940-04-25')],</span> |
| <span class="sd"> ... 'name': ['Alfred', 'Batman', ''],</span> |
| <span class="sd"> ... 'toy': [None, 'Batmobile', 'Joker']})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> age born name toy</span> |
| <span class="sd"> 0 5.0 NaT Alfred None</span> |
| <span class="sd"> 1 6.0 1939-05-27 Batman Batmobile</span> |
| <span class="sd"> 2 NaN 1940-04-25 Joker</span> |
| |
| <span class="sd"> >>> df.notnull()</span> |
| <span class="sd"> age born name toy</span> |
| <span class="sd"> 0 True False True False</span> |
| <span class="sd"> 1 True True True True</span> |
| <span class="sd"> 2 False True True True</span> |
| |
| <span class="sd"> Show which entries in a Series are not NA.</span> |
| |
| <span class="sd"> >>> ser = ps.Series([5, 6, np.NaN])</span> |
| <span class="sd"> >>> ser</span> |
| <span class="sd"> 0 5.0</span> |
| <span class="sd"> 1 6.0</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> ps.notna(ser)</span> |
| <span class="sd"> 0 True</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 False</span> |
| <span class="sd"> dtype: bool</span> |
| |
| <span class="sd"> >>> ps.notna(ser.index)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="c1"># TODO: Add back:</span> |
| <span class="c1"># Series.notnull : Boolean inverse of Series.isnull.</span> |
| <span class="c1"># DataFrame.notna : Boolean inverse of DataFrame.isna.</span> |
| <span class="c1"># into the See Also in the docstring. It does not find the method in the latest numpydoc.</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span> |
| <span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">notna</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">notna</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span></div> |
| |
| |
| <span class="n">notnull</span> <span class="o">=</span> <span class="n">notna</span> |
| |
| |
| <div class="viewcode-block" id="merge"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.merge.html#pyspark.pandas.merge">[docs]</a><span class="k">def</span> <span class="nf">merge</span><span class="p">(</span> |
| <span class="n">obj</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">right</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"inner"</span><span class="p">,</span> |
| <span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">left_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">right_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">left_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">right_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">suffixes</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">"_x"</span><span class="p">,</span> <span class="s2">"_y"</span><span class="p">),</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Merge DataFrame objects with a database-style join.</span> |
| |
| <span class="sd"> The index of the resulting DataFrame will be one of the following:</span> |
| <span class="sd"> - 0...n if no index is used for merging</span> |
| <span class="sd"> - Index of the left DataFrame if merged only on the index of the right DataFrame</span> |
| <span class="sd"> - Index of the right DataFrame if merged only on the index of the left DataFrame</span> |
| <span class="sd"> - All involved indices if merged using the indices of both DataFrames</span> |
| <span class="sd"> e.g. if `left` with indices (a, x) and `right` with indices (b, x), the result will</span> |
| <span class="sd"> be an index (x, a, b)</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> right: Object to merge with.</span> |
| <span class="sd"> how: Type of merge to be performed.</span> |
| <span class="sd"> {'left', 'right', 'outer', 'inner'}, default 'inner'</span> |
| |
| <span class="sd"> left: use only keys from left frame, like a SQL left outer join; preserve key</span> |
| <span class="sd"> order.</span> |
| <span class="sd"> right: use only keys from right frame, like a SQL right outer join; preserve key</span> |
| <span class="sd"> order.</span> |
| <span class="sd"> outer: use union of keys from both frames, like a SQL full outer join; sort keys</span> |
| <span class="sd"> lexicographically.</span> |
| <span class="sd"> inner: use intersection of keys from both frames, like a SQL inner join;</span> |
| <span class="sd"> preserve the order of the left keys.</span> |
| <span class="sd"> on: Column or index level names to join on. These must be found in both DataFrames. If on</span> |
| <span class="sd"> is None and not merging on indexes then this defaults to the intersection of the</span> |
| <span class="sd"> columns in both DataFrames.</span> |
| <span class="sd"> left_on: Column or index level names to join on in the left DataFrame. Can also</span> |
| <span class="sd"> be an array or list of arrays of the length of the left DataFrame.</span> |
| <span class="sd"> These arrays are treated as if they are columns.</span> |
| <span class="sd"> right_on: Column or index level names to join on in the right DataFrame. Can also</span> |
| <span class="sd"> be an array or list of arrays of the length of the right DataFrame.</span> |
| <span class="sd"> These arrays are treated as if they are columns.</span> |
| <span class="sd"> left_index: Use the index from the left DataFrame as the join key(s). If it is a</span> |
| <span class="sd"> MultiIndex, the number of keys in the other DataFrame (either the index or a number of</span> |
| <span class="sd"> columns) must match the number of levels.</span> |
| <span class="sd"> right_index: Use the index from the right DataFrame as the join key. Same caveats as</span> |
| <span class="sd"> left_index.</span> |
| <span class="sd"> suffixes: Suffix to apply to overlapping column names in the left and right side,</span> |
| <span class="sd"> respectively.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A DataFrame of the two merged objects.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],</span> |
| <span class="sd"> ... 'value': [1, 2, 3, 5]},</span> |
| <span class="sd"> ... columns=['lkey', 'value'])</span> |
| <span class="sd"> >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],</span> |
| <span class="sd"> ... 'value': [5, 6, 7, 8]},</span> |
| <span class="sd"> ... columns=['rkey', 'value'])</span> |
| <span class="sd"> >>> df1</span> |
| <span class="sd"> lkey value</span> |
| <span class="sd"> 0 foo 1</span> |
| <span class="sd"> 1 bar 2</span> |
| <span class="sd"> 2 baz 3</span> |
| <span class="sd"> 3 foo 5</span> |
| <span class="sd"> >>> df2</span> |
| <span class="sd"> rkey value</span> |
| <span class="sd"> 0 foo 5</span> |
| <span class="sd"> 1 bar 6</span> |
| <span class="sd"> 2 baz 7</span> |
| <span class="sd"> 3 foo 8</span> |
| |
| <span class="sd"> Merge df1 and df2 on the lkey and rkey columns. The value columns have</span> |
| <span class="sd"> the default suffixes, _x and _y, appended.</span> |
| |
| <span class="sd"> >>> merged = ps.merge(df1, df2, left_on='lkey', right_on='rkey')</span> |
| <span class="sd"> >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y']) # doctest: +ELLIPSIS</span> |
| <span class="sd"> lkey value_x rkey value_y</span> |
| <span class="sd"> ...bar 2 bar 6</span> |
| <span class="sd"> ...baz 3 baz 7</span> |
| <span class="sd"> ...foo 1 foo 5</span> |
| <span class="sd"> ...foo 1 foo 8</span> |
| <span class="sd"> ...foo 5 foo 5</span> |
| <span class="sd"> ...foo 5 foo 8</span> |
| |
| <span class="sd"> >>> left_psdf = ps.DataFrame({'A': [1, 2]})</span> |
| <span class="sd"> >>> right_psdf = ps.DataFrame({'B': ['x', 'y']}, index=[1, 2])</span> |
| |
| <span class="sd"> >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 2 x</span> |
| |
| <span class="sd"> >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how='left').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 None</span> |
| <span class="sd"> 1 2 x</span> |
| |
| <span class="sd"> >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how='right').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 2.0 x</span> |
| <span class="sd"> 2 NaN y</span> |
| |
| <span class="sd"> >>> ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how='outer').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1.0 None</span> |
| <span class="sd"> 1 2.0 x</span> |
| <span class="sd"> 2 NaN y</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> As described in #263, joining string columns currently returns None for missing values</span> |
| <span class="sd"> instead of NaN.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span> |
| <span class="n">right</span><span class="p">,</span> |
| <span class="n">how</span><span class="o">=</span><span class="n">how</span><span class="p">,</span> |
| <span class="n">on</span><span class="o">=</span><span class="n">on</span><span class="p">,</span> |
| <span class="n">left_on</span><span class="o">=</span><span class="n">left_on</span><span class="p">,</span> |
| <span class="n">right_on</span><span class="o">=</span><span class="n">right_on</span><span class="p">,</span> |
| <span class="n">left_index</span><span class="o">=</span><span class="n">left_index</span><span class="p">,</span> |
| <span class="n">right_index</span><span class="o">=</span><span class="n">right_index</span><span class="p">,</span> |
| <span class="n">suffixes</span><span class="o">=</span><span class="n">suffixes</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="merge_asof"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.merge_asof.html#pyspark.pandas.merge_asof">[docs]</a><span class="k">def</span> <span class="nf">merge_asof</span><span class="p">(</span> |
| <span class="n">left</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">right</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">left_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">right_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">left_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">right_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">left_by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">right_by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">suffixes</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">"_x"</span><span class="p">,</span> <span class="s2">"_y"</span><span class="p">),</span> |
| <span class="n">tolerance</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">allow_exact_matches</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">direction</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"backward"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Perform an asof merge.</span> |
| |
| <span class="sd"> This is like a left-join except that we match on nearest</span> |
| <span class="sd"> key rather than equal keys.</span> |
| |
| <span class="sd"> For each row in the left DataFrame:</span> |
| |
| <span class="sd"> - A "backward" search selects the last row in the right DataFrame whose</span> |
| <span class="sd"> 'on' key is less than or equal to the left's key.</span> |
| |
| <span class="sd"> - A "forward" search selects the first row in the right DataFrame whose</span> |
| <span class="sd"> 'on' key is greater than or equal to the left's key.</span> |
| |
| <span class="sd"> - A "nearest" search selects the row in the right DataFrame whose 'on'</span> |
| <span class="sd"> key is closest in absolute distance to the left's key.</span> |
| |
| <span class="sd"> Optionally match on equivalent keys with 'by' before searching with 'on'.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> left : DataFrame or named Series</span> |
| <span class="sd"> right : DataFrame or named Series</span> |
| <span class="sd"> on : label</span> |
| <span class="sd"> Field name to join on. Must be found in both DataFrames.</span> |
| <span class="sd"> The data MUST be ordered. This must be a numeric column,</span> |
| <span class="sd"> such as datetimelike, integer, or float. Either on or left_on/right_on</span> |
| <span class="sd"> must be given.</span> |
| <span class="sd"> left_on : label</span> |
| <span class="sd"> Field name to join on in left DataFrame.</span> |
| <span class="sd"> right_on : label</span> |
| <span class="sd"> Field name to join on in right DataFrame.</span> |
| <span class="sd"> left_index : bool</span> |
| <span class="sd"> Use the index of the left DataFrame as the join key.</span> |
| <span class="sd"> right_index : bool</span> |
| <span class="sd"> Use the index of the right DataFrame as the join key.</span> |
| <span class="sd"> by : column name or list of column names</span> |
| <span class="sd"> Match on these columns before performing merge operation.</span> |
| <span class="sd"> left_by : column name or list of column names</span> |
| <span class="sd"> Field names to match on in the left DataFrame.</span> |
| <span class="sd"> right_by : column name or list of column names</span> |
| <span class="sd"> Field names to match on in the right DataFrame.</span> |
| <span class="sd"> suffixes : 2-length sequence (tuple, list, ...)</span> |
| <span class="sd"> Suffix to apply to overlapping column names in the left and right</span> |
| <span class="sd"> side, respectively.</span> |
| <span class="sd"> tolerance : int or Timedelta, optional, default None</span> |
| <span class="sd"> Select asof tolerance within this range; must be compatible</span> |
| <span class="sd"> with the merge index.</span> |
| <span class="sd"> allow_exact_matches : bool, default True</span> |
| |
| <span class="sd"> - If True, allow matching with the same 'on' value</span> |
| <span class="sd"> (i.e. less-than-or-equal-to / greater-than-or-equal-to)</span> |
| <span class="sd"> - If False, don't match the same 'on' value</span> |
| <span class="sd"> (i.e., strictly less-than / strictly greater-than).</span> |
| |
| <span class="sd"> direction : 'backward' (default), 'forward', or 'nearest'</span> |
| <span class="sd"> Whether to search for prior, subsequent, or closest matches.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> merged : DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> merge : Merge with a database-style join.</span> |
| <span class="sd"> merge_ordered : Merge with optional filling/interpolation.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> left = ps.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})</span> |
| <span class="sd"> >>> left</span> |
| <span class="sd"> a left_val</span> |
| <span class="sd"> 0 1 a</span> |
| <span class="sd"> 1 5 b</span> |
| <span class="sd"> 2 10 c</span> |
| |
| <span class="sd"> >>> right = ps.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})</span> |
| <span class="sd"> >>> right</span> |
| <span class="sd"> a right_val</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 2 2</span> |
| <span class="sd"> 2 3 3</span> |
| <span class="sd"> 3 6 6</span> |
| <span class="sd"> 4 7 7</span> |
| |
| <span class="sd"> >>> ps.merge_asof(left, right, on="a").sort_values("a").reset_index(drop=True)</span> |
| <span class="sd"> a left_val right_val</span> |
| <span class="sd"> 0 1 a 1</span> |
| <span class="sd"> 1 5 b 3</span> |
| <span class="sd"> 2 10 c 7</span> |
| |
| <span class="sd"> >>> ps.merge_asof(</span> |
| <span class="sd"> ... left,</span> |
| <span class="sd"> ... right,</span> |
| <span class="sd"> ... on="a",</span> |
| <span class="sd"> ... allow_exact_matches=False</span> |
| <span class="sd"> ... ).sort_values("a").reset_index(drop=True)</span> |
| <span class="sd"> a left_val right_val</span> |
| <span class="sd"> 0 1 a NaN</span> |
| <span class="sd"> 1 5 b 3.0</span> |
| <span class="sd"> 2 10 c 7.0</span> |
| |
| <span class="sd"> >>> ps.merge_asof(</span> |
| <span class="sd"> ... left,</span> |
| <span class="sd"> ... right,</span> |
| <span class="sd"> ... on="a",</span> |
| <span class="sd"> ... direction="forward"</span> |
| <span class="sd"> ... ).sort_values("a").reset_index(drop=True)</span> |
| <span class="sd"> a left_val right_val</span> |
| <span class="sd"> 0 1 a 1.0</span> |
| <span class="sd"> 1 5 b 6.0</span> |
| <span class="sd"> 2 10 c NaN</span> |
| |
| <span class="sd"> >>> ps.merge_asof(</span> |
| <span class="sd"> ... left,</span> |
| <span class="sd"> ... right,</span> |
| <span class="sd"> ... on="a",</span> |
| <span class="sd"> ... direction="nearest"</span> |
| <span class="sd"> ... ).sort_values("a").reset_index(drop=True)</span> |
| <span class="sd"> a left_val right_val</span> |
| <span class="sd"> 0 1 a 1</span> |
| <span class="sd"> 1 5 b 6</span> |
| <span class="sd"> 2 10 c 7</span> |
| |
| <span class="sd"> We can use indexed DataFrames as well.</span> |
| |
| <span class="sd"> >>> left = ps.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])</span> |
| <span class="sd"> >>> left</span> |
| <span class="sd"> left_val</span> |
| <span class="sd"> 1 a</span> |
| <span class="sd"> 5 b</span> |
| <span class="sd"> 10 c</span> |
| |
| <span class="sd"> >>> right = ps.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])</span> |
| <span class="sd"> >>> right</span> |
| <span class="sd"> right_val</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 6 6</span> |
| <span class="sd"> 7 7</span> |
| |
| <span class="sd"> >>> ps.merge_asof(left, right, left_index=True, right_index=True).sort_index()</span> |
| <span class="sd"> left_val right_val</span> |
| <span class="sd"> 1 a 1</span> |
| <span class="sd"> 5 b 3</span> |
| <span class="sd"> 10 c 7</span> |
| |
| <span class="sd"> Here is a real-world time-series example</span> |
| |
| <span class="sd"> >>> quotes = ps.DataFrame(</span> |
| <span class="sd"> ... {</span> |
| <span class="sd"> ... "time": [</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.023"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.023"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.030"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.041"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.048"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.049"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.072"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.075")</span> |
| <span class="sd"> ... ],</span> |
| <span class="sd"> ... "ticker": [</span> |
| <span class="sd"> ... "GOOG",</span> |
| <span class="sd"> ... "MSFT",</span> |
| <span class="sd"> ... "MSFT",</span> |
| <span class="sd"> ... "MSFT",</span> |
| <span class="sd"> ... "GOOG",</span> |
| <span class="sd"> ... "AAPL",</span> |
| <span class="sd"> ... "GOOG",</span> |
| <span class="sd"> ... "MSFT"</span> |
| <span class="sd"> ... ],</span> |
| <span class="sd"> ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],</span> |
| <span class="sd"> ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]</span> |
| <span class="sd"> ... }</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> quotes</span> |
| <span class="sd"> time ticker bid ask</span> |
| <span class="sd"> 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93</span> |
| <span class="sd"> 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96</span> |
| <span class="sd"> 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98</span> |
| <span class="sd"> 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00</span> |
| <span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93</span> |
| <span class="sd"> 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01</span> |
| <span class="sd"> 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88</span> |
| <span class="sd"> 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03</span> |
| |
| <span class="sd"> >>> trades = ps.DataFrame(</span> |
| <span class="sd"> ... {</span> |
| <span class="sd"> ... "time": [</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.023"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.038"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.048"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.048"),</span> |
| <span class="sd"> ... pd.Timestamp("2016-05-25 13:30:00.048")</span> |
| <span class="sd"> ... ],</span> |
| <span class="sd"> ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],</span> |
| <span class="sd"> ... "price": [51.95, 51.95, 720.77, 720.92, 98.0],</span> |
| <span class="sd"> ... "quantity": [75, 155, 100, 100, 100]</span> |
| <span class="sd"> ... }</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> trades</span> |
| <span class="sd"> time ticker price quantity</span> |
| <span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75</span> |
| <span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155</span> |
| <span class="sd"> 2 2016-05-25 13:30:00.048 GOOG 720.77 100</span> |
| <span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.92 100</span> |
| <span class="sd"> 4 2016-05-25 13:30:00.048 AAPL 98.00 100</span> |
| |
| <span class="sd"> By default we are taking the asof of the quotes</span> |
| |
| <span class="sd"> >>> ps.merge_asof(</span> |
| <span class="sd"> ... trades, quotes, on="time", by="ticker"</span> |
| <span class="sd"> ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)</span> |
| <span class="sd"> time ticker price quantity bid ask</span> |
| <span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96</span> |
| <span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98</span> |
| <span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span> |
| <span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93</span> |
| <span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93</span> |
| |
| <span class="sd"> We only asof within 2ms between the quote time and the trade time</span> |
| |
| <span class="sd"> >>> ps.merge_asof(</span> |
| <span class="sd"> ... trades,</span> |
| <span class="sd"> ... quotes,</span> |
| <span class="sd"> ... on="time",</span> |
| <span class="sd"> ... by="ticker",</span> |
| <span class="sd"> ... tolerance=sf.expr("INTERVAL 2 MILLISECONDS") # pd.Timedelta("2ms")</span> |
| <span class="sd"> ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)</span> |
| <span class="sd"> time ticker price quantity bid ask</span> |
| <span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96</span> |
| <span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN</span> |
| <span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span> |
| <span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93</span> |
| <span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93</span> |
| |
| <span class="sd"> We only asof within 10ms between the quote time and the trade time</span> |
| <span class="sd"> and we exclude exact matches on time. However *prior* data will</span> |
| <span class="sd"> propagate forward</span> |
| |
| <span class="sd"> >>> ps.merge_asof(</span> |
| <span class="sd"> ... trades,</span> |
| <span class="sd"> ... quotes,</span> |
| <span class="sd"> ... on="time",</span> |
| <span class="sd"> ... by="ticker",</span> |
| <span class="sd"> ... tolerance=sf.expr("INTERVAL 10 MILLISECONDS"), # pd.Timedelta("10ms")</span> |
| <span class="sd"> ... allow_exact_matches=False</span> |
| <span class="sd"> ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)</span> |
| <span class="sd"> time ticker price quantity bid ask</span> |
| <span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN</span> |
| <span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98</span> |
| <span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span> |
| <span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN</span> |
| <span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">to_list</span><span class="p">(</span><span class="n">os</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="n">os</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">[]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">os</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">os</span><span class="p">)]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">os</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">[(</span><span class="n">os</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">o</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">o</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">o</span><span class="p">,)</span> <span class="k">for</span> <span class="n">o</span> <span class="ow">in</span> <span class="n">os</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">left</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">right</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="n">on</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">left_on</span> <span class="ow">or</span> <span class="n">right_on</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s1">'Can only pass argument "on" OR "left_on" and "right_on", '</span> |
| <span class="s2">"not a combination of both."</span> |
| <span class="p">)</span> |
| <span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span> |
| <span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">left_index</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"left can only have one index"</span><span class="p">)</span> |
| <span class="n">left_as_of_names</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">left_on</span><span class="p">)))</span> |
| <span class="k">if</span> <span class="n">right_index</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"right can only have one index"</span><span class="p">)</span> |
| <span class="n">right_as_of_names</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">right_on</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="n">left_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_as_of_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Must pass right_on or right_index=True"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">right_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">left_as_of_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Must pass left_on or left_index=True"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">left_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_as_of_names</span><span class="p">:</span> |
| <span class="n">common</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">common</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"No common columns to perform merge on. Merge options: "</span> |
| <span class="s2">"left_on=None, right_on=None, left_index=False, right_index=False"</span> |
| <span class="p">)</span> |
| <span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span> |
| <span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">left_as_of_names</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"can only asof on a key for left"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">right_as_of_names</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"can only asof on a key for right"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">by</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">left_by</span> <span class="ow">or</span> <span class="n">right_by</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">'Can only pass argument "by" OR "left_by" and "right_by".'</span><span class="p">)</span> |
| <span class="n">left_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">by</span><span class="p">)))</span> |
| <span class="n">right_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">by</span><span class="p">)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">left_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">left_by</span><span class="p">)))</span> |
| <span class="n">right_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">right_by</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="n">left_join_on_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_join_on_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"missing right_by"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">right_join_on_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">left_join_on_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"missing left_by"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">left_join_on_names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">right_join_on_names</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"left_by and right_by must be same length"</span><span class="p">)</span> |
| |
| <span class="c1"># Prefix the right-side names so that column names are unambiguous after merging.</span> |
| <span class="n">right_prefix</span> <span class="o">=</span> <span class="s2">"__right_"</span> |
| <span class="n">right_as_of_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">right_as_of_name</span> <span class="k">for</span> <span class="n">right_as_of_name</span> <span class="ow">in</span> <span class="n">right_as_of_names</span><span class="p">]</span> |
| <span class="n">right_join_on_names</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">right_prefix</span> <span class="o">+</span> <span class="n">right_join_on_name</span> <span class="k">for</span> <span class="n">right_join_on_name</span> <span class="ow">in</span> <span class="n">right_join_on_names</span> |
| <span class="p">]</span> |
| |
| <span class="n">left_as_of_name</span> <span class="o">=</span> <span class="n">left_as_of_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">right_as_of_name</span> <span class="o">=</span> <span class="n">right_as_of_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">def</span> <span class="nf">resolve</span><span class="p">(</span><span class="n">internal</span><span class="p">:</span> <span class="n">InternalFrame</span><span class="p">,</span> <span class="n">side</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="k">def</span> <span class="nf">rename</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="k">return</span> <span class="s2">"__</span><span class="si">{}</span><span class="s2">_</span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">side</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">HIDDEN_COLUMNS</span> |
| <span class="p">],</span> |
| <span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="n">left_internal</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="n">right_internal</span> <span class="o">=</span> <span class="n">resolve</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="p">,</span> <span class="s2">"right"</span><span class="p">)</span> |
| |
| <span class="n">left_table</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"left_table"</span><span class="p">)</span> |
| <span class="n">right_table</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"right_table"</span><span class="p">)</span> |
| |
| <span class="n">left_as_of_column</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">left_table</span><span class="p">,</span> <span class="n">left_as_of_name</span><span class="p">)</span> |
| <span class="n">right_as_of_column</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">right_as_of_name</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">left_join_on_names</span><span class="p">:</span> |
| <span class="n">left_join_on_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">left_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_join_on_names</span><span class="p">]</span> |
| <span class="n">right_join_on_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_join_on_names</span><span class="p">]</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">lft</span><span class="p">,</span> <span class="n">rgt</span><span class="p">:</span> <span class="n">lft</span> <span class="o">&</span> <span class="n">rgt</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">lft</span> <span class="o">==</span> <span class="n">rgt</span> <span class="k">for</span> <span class="n">lft</span><span class="p">,</span> <span class="n">rgt</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">left_join_on_columns</span><span class="p">,</span> <span class="n">right_join_on_columns</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="n">Column</span> <span class="o">=</span> <span class="n">get_column_class</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">tolerance</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tolerance</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">tolerance</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">tolerance</span><span class="p">)</span> |
| |
| <span class="n">as_of_joined_table</span> <span class="o">=</span> <span class="n">left_table</span><span class="o">.</span><span class="n">_joinAsOf</span><span class="p">(</span> |
| <span class="n">right_table</span><span class="p">,</span> |
| <span class="n">leftAsOfColumn</span><span class="o">=</span><span class="n">left_as_of_column</span><span class="p">,</span> |
| <span class="n">rightAsOfColumn</span><span class="o">=</span><span class="n">right_as_of_column</span><span class="p">,</span> |
| <span class="n">on</span><span class="o">=</span><span class="n">on</span><span class="p">,</span> |
| <span class="n">how</span><span class="o">=</span><span class="s2">"left"</span><span class="p">,</span> |
| <span class="n">tolerance</span><span class="o">=</span><span class="n">tolerance</span><span class="p">,</span> |
| <span class="n">allowExactMatches</span><span class="o">=</span><span class="n">allow_exact_matches</span><span class="p">,</span> |
| <span class="n">direction</span><span class="o">=</span><span class="n">direction</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Unpack suffixes tuple for convenience</span> |
| <span class="n">left_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">right_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> |
| |
| <span class="c1"># Append suffixes to columns with the same name to avoid conflicts later</span> |
| <span class="n">duplicate_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&</span> <span class="nb">set</span><span class="p">(</span><span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> |
| |
| <span class="n">exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| |
| <span class="k">def</span> <span class="nf">left_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> <span class="c1"># type: ignore[valid-type]</span> |
| <span class="k">return</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">as_of_joined_table</span><span class="p">,</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="nf">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> <span class="c1"># type: ignore[valid-type]</span> |
| <span class="k">return</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">as_of_joined_table</span><span class="p">,</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">left_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span> |
| <span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="p">(</span><span class="n">left_as_of_names</span> <span class="o">+</span> <span class="n">left_join_on_names</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span> |
| <span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="p">(</span><span class="n">right_as_of_names</span> <span class="o">+</span> <span class="n">right_join_on_names</span><span class="p">)</span> |
| <span class="p">):</span> |
| <span class="k">pass</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">left_suffix</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">left_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="c1"># Strip `right_prefix` here to recover the original column name.</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)[</span><span class="nb">len</span><span class="p">(</span><span class="n">right_prefix</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span> |
| <span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="n">left_as_of_names</span> <span class="o">+</span> <span class="n">left_join_on_names</span> <span class="ow">and</span> <span class="p">(</span> |
| <span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="n">right_as_of_names</span> <span class="o">+</span> <span class="n">right_join_on_names</span> |
| <span class="p">):</span> |
| <span class="k">continue</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">right_suffix</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">right_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="c1"># Retain indices if they are used for joining</span> |
| <span class="k">if</span> <span class="n">left_index</span> <span class="ow">or</span> <span class="n">right_index</span><span class="p">:</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">))</span> |
| <span class="p">]</span> |
| <span class="n">left_index_scols</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_spark_column_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">left_index_scols</span><span class="p">)</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span> |
| |
| <span class="n">selected_columns</span> <span class="o">=</span> <span class="n">as_of_joined_table</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">exprs</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">selected_columns</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_numeric"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_numeric.html#pyspark.pandas.to_numeric">[docs]</a><span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">to_numeric</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s2">"raise"</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert argument to a numeric type.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> arg : scalar, list, tuple, 1-d array, or Series</span> |
| <span class="sd"> Argument to be converted.</span> |
| <span class="sd"> errors : {'raise', 'coerce', 'ignore'}, default 'raise'</span> |
| <span class="sd"> * If 'coerce', then invalid parsing will be set as NaN.</span> |
| <span class="sd"> * If 'raise', then invalid parsing will raise an exception.</span> |
| <span class="sd"> * If 'ignore', then invalid parsing will return the input.</span> |
| |
| <span class="sd"> .. note:: 'ignore' doesn't work yet when `arg` is a pandas-on-Spark Series.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> ret : numeric if parsing succeeded.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.astype : Cast argument to a specified dtype.</span> |
| <span class="sd"> to_datetime : Convert argument to datetime.</span> |
| <span class="sd"> to_timedelta : Convert argument to timedelta.</span> |
| <span class="sd"> numpy.ndarray.astype : Cast a numpy array to a specified type.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> psser = ps.Series(['1.0', '2', '-3'])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 -3</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> ps.to_numeric(psser)</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 -3.0</span> |
| <span class="sd"> dtype: float32</span> |
| |
| <span class="sd"> If the given Series contains a value that cannot be cast to float, that value becomes `np.nan`</span> |
| <span class="sd"> when `errors` is set to "coerce".</span> |
| |
| <span class="sd"> >>> psser = ps.Series(['apple', '1.0', '2', '-3'])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 apple</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 -3</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> ps.to_numeric(psser, errors="coerce")</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 -3.0</span> |
| <span class="sd"> dtype: float32</span> |
| |
| <span class="sd"> A list, tuple, np.array, or scalar is also supported.</span> |
| |
| <span class="sd"> >>> ps.to_numeric(['1.0', '2', '-3'])</span> |
| <span class="sd"> array([ 1., 2., -3.])</span> |
| |
| <span class="sd"> >>> ps.to_numeric(('1.0', '2', '-3'))</span> |
| <span class="sd"> array([ 1., 2., -3.])</span> |
| |
| <span class="sd"> >>> ps.to_numeric(np.array(['1.0', '2', '-3']))</span> |
| <span class="sd"> array([ 1., 2., -3.])</span> |
| |
| <span class="sd"> >>> ps.to_numeric('1.0')</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">"coerce"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"float"</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">"raise"</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">scol_casted</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"float"</span><span class="p">)</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">assert_true</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span> <span class="o">|</span> <span class="n">scol_casted</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">())</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">scol_casted</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">"ignore"</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"'ignore' is not implemented yet, when the `arg` is Series."</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"invalid error value specified"</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_numeric</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="broadcast"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.broadcast.html#pyspark.pandas.broadcast">[docs]</a><span class="k">def</span> <span class="nf">broadcast</span><span class="p">(</span><span class="n">obj</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Marks a DataFrame as small enough for use in broadcast joins.</span> |
| |
| <span class="sd"> .. deprecated:: 3.2.0</span> |
| <span class="sd"> Use :func:`DataFrame.spark.hint` instead.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> obj : DataFrame</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> ret : DataFrame with broadcast hint.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.merge : Merge DataFrame objects with a database-style join.</span> |
| <span class="sd"> DataFrame.join : Join columns of another DataFrame.</span> |
| <span class="sd"> DataFrame.update : Modify in place using non-NA values from another DataFrame.</span> |
| <span class="sd"> DataFrame.hint : Specifies some hint on the current DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],</span> |
| <span class="sd"> ... 'value': [1, 2, 3, 5]},</span> |
| <span class="sd"> ... columns=['lkey', 'value']).set_index('lkey')</span> |
| <span class="sd"> >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],</span> |
| <span class="sd"> ... 'value': [5, 6, 7, 8]},</span> |
| <span class="sd"> ... columns=['rkey', 'value']).set_index('rkey')</span> |
| <span class="sd"> >>> merged = df1.merge(ps.broadcast(df2), left_index=True, right_index=True)</span> |
| <span class="sd"> >>> merged.spark.explain() # doctest: +ELLIPSIS</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ...BroadcastHashJoin...</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"`broadcast` has been deprecated and might be removed in a future version. "</span> |
| <span class="s2">"Use `DataFrame.spark.hint` with 'broadcast' for `name` parameter instead."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Invalid type : expected DataFrame got </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">broadcast</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">))</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="read_orc"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_orc.html#pyspark.pandas.read_orc">[docs]</a><span class="k">def</span> <span class="nf">read_orc</span><span class="p">(</span> |
| <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Load an ORC object from the file path, returning a DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : str</span> |
| <span class="sd"> The path string storing the ORC file to be read.</span> |
| <span class="sd"> columns : list, default None</span> |
| <span class="sd"> If not None, only these columns will be read from the file.</span> |
| <span class="sd"> index_col : str or list of str, optional, default: None</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's data source.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.range(1).to_orc('%s/read_spark_io/data.orc' % path)</span> |
| <span class="sd"> >>> ps.read_orc('%s/read_spark_io/data.orc' % path, columns=['id'])</span> |
| <span class="sd"> id</span> |
| <span class="sd"> 0 0</span> |
| |
| <span class="sd"> You can preserve the index in the roundtrip as below.</span> |
| |
| <span class="sd"> >>> ps.range(1).to_orc('%s/read_spark_io/data.orc' % path, index_col="index")</span> |
| <span class="sd"> >>> ps.read_orc('%s/read_spark_io/data.orc' % path, columns=['id'], index_col="index")</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> id</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">"orc"</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psdf_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> |
| <span class="n">new_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">psdf_columns</span><span class="p">:</span> |
| <span class="n">new_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Unknown column name '</span><span class="si">{}</span><span class="s2">'"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">column</span><span class="p">))</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">new_columns</span><span class="p">]</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_get_index_map</span><span class="p">(</span> |
| <span class="n">sdf</span><span class="p">:</span> <span class="n">PySparkDataFrame</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">PySparkColumn</span><span class="p">]],</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]]]:</span> |
| <span class="n">index_spark_columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">PySparkColumn</span><span class="p">]]</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]]</span> |
| <span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">index_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span> |
| <span class="n">sdf_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">sdf_columns</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">return</span> <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> |
| |
| |
| <span class="n">_get_dummies_default_accept_types</span> <span class="o">=</span> <span class="p">(</span><span class="n">DecimalType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">DateType</span><span class="p">)</span> |
| <span class="n">_get_dummies_acceptable_types</span> <span class="o">=</span> <span class="n">_get_dummies_default_accept_types</span> <span class="o">+</span> <span class="p">(</span> |
| <span class="n">ByteType</span><span class="p">,</span> |
| <span class="n">ShortType</span><span class="p">,</span> |
| <span class="n">IntegerType</span><span class="p">,</span> |
| <span class="n">LongType</span><span class="p">,</span> |
| <span class="n">FloatType</span><span class="p">,</span> |
| <span class="n">DoubleType</span><span class="p">,</span> |
| <span class="n">BooleanType</span><span class="p">,</span> |
| <span class="n">TimestampType</span><span class="p">,</span> |
| <span class="n">TimestampNTZType</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">shutil</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">tempfile</span> |
| <span class="kn">import</span> <span class="nn">uuid</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.pandas.namespace</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_HOME"</span><span class="p">])</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">namespace</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"ps"</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"sf"</span><span class="p">]</span> <span class="o">=</span> <span class="n">F</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"pyspark.pandas.namespace tests"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="p">)</span> |
| |
| <span class="n">db_name</span> <span class="o">=</span> <span class="s2">"db</span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">str</span><span class="p">(</span><span class="n">uuid</span><span class="o">.</span><span class="n">uuid4</span><span class="p">())</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">"-"</span><span class="p">,</span> <span class="s2">""</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"CREATE DATABASE </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"db"</span><span class="p">]</span> <span class="o">=</span> <span class="n">db_name</span> |
| |
| <span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"path"</span><span class="p">]</span> <span class="o">=</span> <span class="n">path</span> |
| |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">namespace</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">ignore_errors</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"DROP DATABASE IF EXISTS </span><span class="si">%s</span><span class="s2"> CASCADE"</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts are loaded near the end of <body>, after the main content, so they do not block parsing of the page above --> |
| <script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <!-- Page footer: the start group holds the copyright/licence notice and the Sphinx version; the end group holds the theme version. --> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |