Source code for pyspark.sql.datasource

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC, abstractmethod
from collections import UserDict
from typing import Any, Dict, Iterator, List, Sequence, Tuple, Type, Union, TYPE_CHECKING

from pyspark.sql import Row
from pyspark.sql.types import StructType
from pyspark.errors import PySparkNotImplementedError

if TYPE_CHECKING:
    from pyspark.sql.session import SparkSession


__all__ = [
    "DataSource",
    "DataSourceReader",
    "DataSourceStreamReader",
    "DataSourceWriter",
    "DataSourceRegistration",
    "InputPartition",
    "WriterCommitMessage",
]

| <div class="viewcode-block" id="DataSource"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSource.html#pyspark.sql.datasource.DataSource">[docs]</a><span class="k">class</span> <span class="nc">DataSource</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A base class for data sources.</span> |
| |
| <span class="sd"> This class represents a custom data source that allows for reading from and/or</span> |
| <span class="sd"> writing to it. The data source provides methods to create readers and writers</span> |
| <span class="sd"> for reading and writing data, respectively. At least one of the methods</span> |
| <span class="sd"> :meth:`DataSource.reader` or :meth:`DataSource.writer` must be implemented</span> |
| <span class="sd"> by any subclass to make the data source either readable or writable (or both).</span> |
| |
| <span class="sd"> After implementing this interface, you can start to load your data source using</span> |
| <span class="sd"> ``spark.read.format(...).load()`` and save data using ``df.write.format(...).save()``.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Initializes the data source with user-provided options.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> A case-insensitive dictionary representing the options for this data source.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method should not be overridden.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">options</span> <span class="o">=</span> <span class="n">options</span> |
| |
| <div class="viewcode-block" id="DataSource.name"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSource.name.html#pyspark.sql.datasource.DataSource.name">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="k">def</span> <span class="nf">name</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a string represents the format name of this data source.</span> |
| |
| <span class="sd"> By default, it is the class name of the data source. It can be overridden to</span> |
| <span class="sd"> provide a customized short name for the data source.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> def name(cls):</span> |
| <span class="sd"> ... return "my_data_source"</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">cls</span><span class="o">.</span><span class="vm">__name__</span></div> |
| |
| <div class="viewcode-block" id="DataSource.schema"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSource.schema.html#pyspark.sql.datasource.DataSource.schema">[docs]</a> <span class="k">def</span> <span class="nf">schema</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the schema of the data source.</span> |
| |
| <span class="sd"> It can refer any field initialized in the :meth:`DataSource.__init__` method</span> |
| <span class="sd"> to infer the data source's schema when users do not explicitly specify it.</span> |
| <span class="sd"> This method is invoked once when calling ``spark.read.format(...).load()``</span> |
| <span class="sd"> to get the schema for a data source read operation. If this method is not</span> |
| <span class="sd"> implemented, and a user does not provide a schema when reading the data source,</span> |
| <span class="sd"> an exception will be thrown.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> schema : :class:`StructType` or str</span> |
| <span class="sd"> The schema of this data source or a DDL string represents the schema</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Returns a DDL string:</span> |
| |
| <span class="sd"> >>> def schema(self):</span> |
| <span class="sd"> ... return "a INT, b STRING"</span> |
| |
| <span class="sd"> Returns a :class:`StructType`:</span> |
| |
| <span class="sd"> >>> def schema(self):</span> |
| <span class="sd"> ... return StructType().add("a", "int").add("b", "string")</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"schema"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataSource.reader"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSource.reader.html#pyspark.sql.datasource.DataSource.reader">[docs]</a> <span class="k">def</span> <span class="nf">reader</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataSourceReader"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`DataSourceReader` instance for reading data.</span> |
| |
| <span class="sd"> The implementation is required for readable data sources.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> schema : :class:`StructType`</span> |
| <span class="sd"> The schema of the data to be read.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> reader : :class:`DataSourceReader`</span> |
| <span class="sd"> A reader instance for this data source.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"reader"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataSource.writer"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSource.writer.html#pyspark.sql.datasource.DataSource.writer">[docs]</a> <span class="k">def</span> <span class="nf">writer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">overwrite</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataSourceWriter"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`DataSourceWriter` instance for writing data.</span> |
| |
| <span class="sd"> The implementation is required for writable data sources.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> schema : :class:`StructType`</span> |
| <span class="sd"> The schema of the data to be written.</span> |
| <span class="sd"> overwrite : bool</span> |
| <span class="sd"> A flag indicating whether to overwrite existing data when writing to the data source.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> writer : :class:`DataSourceWriter`</span> |
| <span class="sd"> A writer instance for this data source.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"writer"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">streamWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">overwrite</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataSourceStreamWriter"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`DataSourceStreamWriter` instance for writing data into a streaming sink.</span> |
| |
| <span class="sd"> The implementation is required for writable streaming data sources.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> schema : :class:`StructType`</span> |
| <span class="sd"> The schema of the data to be written.</span> |
| <span class="sd"> overwrite : bool</span> |
| <span class="sd"> A flag indicating whether to overwrite existing data when writing current microbatch.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> writer : :class:`DataSourceStreamWriter`</span> |
| <span class="sd"> A writer instance for writing data into a streaming sink.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"streamWriter"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">simpleStreamReader</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SimpleDataSourceStreamReader"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`SimpleDataSourceStreamReader` instance for reading data.</span> |
| |
| <span class="sd"> One of simpleStreamReader() and streamReader() must be implemented for readable streaming</span> |
| <span class="sd"> data source. Spark will check whether streamReader() is implemented, if yes, create a</span> |
| <span class="sd"> DataSourceStreamReader to read data. simpleStreamReader() will only be invoked when</span> |
| <span class="sd"> streamReader() is not implemented.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> schema : :class:`StructType`</span> |
| <span class="sd"> The schema of the data to be read.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> reader : :class:`SimpleDataSourceStreamReader`</span> |
| <span class="sd"> A reader instance for this data source.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"simpleStreamReader"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataSource.streamReader"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSource.streamReader.html#pyspark.sql.datasource.DataSource.streamReader">[docs]</a> <span class="k">def</span> <span class="nf">streamReader</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataSourceStreamReader"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`DataSourceStreamReader` instance for reading streaming data.</span> |
| |
| <span class="sd"> One of simpleStreamReader() and streamReader() must be implemented for readable streaming</span> |
| <span class="sd"> data source.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> schema : :class:`StructType`</span> |
| <span class="sd"> The schema of the data to be read.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> reader : :class:`DataSourceStreamReader`</span> |
| <span class="sd"> A reader instance for this streaming data source.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"streamReader"</span><span class="p">},</span> |
| <span class="p">)</span></div></div> |
| |
| |
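# --- Illustrative sketch (not part of the original module) -------------------
# A minimal readable data source assembled from the interface above. The names
# "_ExampleDataSource" / "fake" are made up for illustration, and the usage
# lines assume the Spark 4.0 Python data source registration API
# (``spark.dataSource.register``).

class _ExampleDataSourceReader(DataSourceReader):
    def read(self, partition):
        # Emit two hard-coded rows matching the DDL schema declared below.
        yield ("Alice", 20)
        yield ("Bob", 30)


class _ExampleDataSource(DataSource):
    @classmethod
    def name(cls):
        # Short format name used in spark.read.format("fake")
        return "fake"

    def schema(self):
        return "name STRING, age INT"

    def reader(self, schema):
        return _ExampleDataSourceReader()

# Expected usage (sketch):
#   spark.dataSource.register(_ExampleDataSource)
#   spark.read.format("fake").load().show()
# ------------------------------------------------------------------------------
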
| <div class="viewcode-block" id="InputPartition"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.InputPartition.html#pyspark.sql.datasource.InputPartition">[docs]</a><span class="k">class</span> <span class="nc">InputPartition</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A base class representing an input partition returned by the `partitions()`</span> |
| <span class="sd"> method of :class:`DataSourceReader`.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This class must be picklable.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Use the default input partition implementation:</span> |
| |
| <span class="sd"> >>> def partitions(self):</span> |
| <span class="sd"> ... return [InputPartition(1)]</span> |
| |
| <span class="sd"> Subclass the input partition class:</span> |
| |
| <span class="sd"> >>> from dataclasses import dataclass</span> |
| <span class="sd"> >>> @dataclass</span> |
| <span class="sd"> ... class RangeInputPartition(InputPartition):</span> |
| <span class="sd"> ... start: int</span> |
| <span class="sd"> ... end: int</span> |
| |
| <span class="sd"> >>> def partitions(self):</span> |
| <span class="sd"> ... return [RangeInputPartition(1, 3), RangeInputPartition(4, 6)]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> |
| |
| <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="n">attributes</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">k</span><span class="si">}</span><span class="s2">=</span><span class="si">{</span><span class="n">v</span><span class="si">!r}</span><span class="s2">"</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">items</span><span class="p">()])</span> |
| <span class="k">return</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">attributes</span><span class="si">}</span><span class="s2">)"</span></div> |
| |
| |
| <div class="viewcode-block" id="DataSourceReader"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceReader.html#pyspark.sql.datasource.DataSourceReader">[docs]</a><span class="k">class</span> <span class="nc">DataSourceReader</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A base class for data source readers. Data source readers are responsible for</span> |
| <span class="sd"> outputting data from a data source.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="DataSourceReader.partitions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceReader.partitions.html#pyspark.sql.datasource.DataSourceReader.partitions">[docs]</a> <span class="k">def</span> <span class="nf">partitions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Sequence</span><span class="p">[</span><span class="n">InputPartition</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns an iterator of partitions for this data source.</span> |
| |
| <span class="sd"> Partitions are used to split data reading operations into parallel tasks.</span> |
| <span class="sd"> If this method returns N partitions, the query planner will create N tasks.</span> |
| <span class="sd"> Each task will execute :meth:`DataSourceReader.read` in parallel, using the respective</span> |
| <span class="sd"> partition value to read the data.</span> |
| |
| <span class="sd"> This method is called once during query planning. By default, it returns a</span> |
| <span class="sd"> single partition with the value ``None``. Subclasses can override this method</span> |
| <span class="sd"> to return multiple partitions.</span> |
| |
| <span class="sd"> It's recommended to override this method for better performance when reading</span> |
| <span class="sd"> large datasets.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> sequence of :class:`InputPartition`\\s</span> |
| <span class="sd"> A sequence of partitions for this data source. Each partition value</span> |
| <span class="sd"> must be an instance of `InputPartition` or a subclass of it.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> All partition values must be picklable objects.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Returns a list of integers:</span> |
| |
| <span class="sd"> >>> def partitions(self):</span> |
| <span class="sd"> ... return [InputPartition(1), InputPartition(2), InputPartition(3)]</span> |
| |
| <span class="sd"> Returns a list of string:</span> |
| |
| <span class="sd"> >>> def partitions(self):</span> |
| <span class="sd"> ... return [InputPartition("a"), InputPartition("b"), InputPartition("c")]</span> |
| |
| <span class="sd"> Returns a list of ranges:</span> |
| |
| <span class="sd"> >>> class RangeInputPartition(InputPartition):</span> |
| <span class="sd"> ... def __init__(self, start, end):</span> |
| <span class="sd"> ... self.start = start</span> |
| <span class="sd"> ... self.end = end</span> |
| |
| <span class="sd"> >>> def partitions(self):</span> |
| <span class="sd"> ... return [RangeInputPartition(1, 3), RangeInputPartition(5, 10)]</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"partitions"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataSourceReader.read"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceReader.read.html#pyspark.sql.datasource.DataSourceReader.read">[docs]</a> <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">partition</span><span class="p">:</span> <span class="n">InputPartition</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">],</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Generates data for a given partition and returns an iterator of tuples or rows.</span> |
| |
| <span class="sd"> This method is invoked once per partition to read the data. Implementing</span> |
| <span class="sd"> this method is required for readable data sources. You can initialize any</span> |
| <span class="sd"> non-serializable resources required for reading data from the data source</span> |
| <span class="sd"> within this method.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> partition : object</span> |
| <span class="sd"> The partition to read. It must be one of the partition values returned by</span> |
| <span class="sd"> :meth:`DataSourceReader.partitions`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> iterator of tuples or :class:`Row`\\s</span> |
| <span class="sd"> An iterator of tuples or rows. Each tuple or row will be converted to a row</span> |
| <span class="sd"> in the final DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Yields a list of tuples:</span> |
| |
| <span class="sd"> >>> def read(self, partition: InputPartition):</span> |
| <span class="sd"> ... yield (partition.value, 0)</span> |
| <span class="sd"> ... yield (partition.value, 1)</span> |
| |
| <span class="sd"> Yields a list of rows:</span> |
| |
| <span class="sd"> >>> def read(self, partition: InputPartition):</span> |
| <span class="sd"> ... yield Row(partition=partition.value, value=0)</span> |
| <span class="sd"> ... yield Row(partition=partition.value, value=1)</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div></div> |
| |
| |
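# --- Illustrative sketch (not part of the original module) -------------------
# A reader that splits work across numeric ranges. "_RangeReader" is a made-up
# name; it combines partitions() and read() as described above so that each
# Spark task reads one range in parallel.

class _RangeReader(DataSourceReader):
    def __init__(self, num_rows: int, num_partitions: int) -> None:
        self.num_rows = num_rows
        self.num_partitions = num_partitions

    def partitions(self) -> Sequence[InputPartition]:
        # One InputPartition per chunk; each value is a picklable (start, end) tuple.
        step = self.num_rows // self.num_partitions
        parts = []
        for i in range(self.num_partitions):
            start = i * step
            # The last partition absorbs any remainder so all rows are covered.
            end = self.num_rows if i == self.num_partitions - 1 else (i + 1) * step
            parts.append(InputPartition((start, end)))
        return parts

    def read(self, partition: InputPartition) -> Iterator[Tuple]:
        # Executed once per partition on executors; yields one tuple per row.
        start, end = partition.value
        for i in range(start, end):
            yield (i, i * i)
# ------------------------------------------------------------------------------
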
| <div class="viewcode-block" id="DataSourceStreamReader"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.html#pyspark.sql.datasource.DataSourceStreamReader">[docs]</a><span class="k">class</span> <span class="nc">DataSourceStreamReader</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A base class for streaming data source readers. Data source stream readers are responsible</span> |
| <span class="sd"> for outputting data from a streaming data source.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="DataSourceStreamReader.initialOffset"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.initialOffset.html#pyspark.sql.datasource.DataSourceStreamReader.initialOffset">[docs]</a> <span class="k">def</span> <span class="nf">initialOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">dict</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the initial offset of the streaming data source.</span> |
| <span class="sd"> A new streaming query starts reading data from the initial offset.</span> |
| <span class="sd"> If Spark is restarting an existing query, it will restart from the check-pointed offset</span> |
| <span class="sd"> rather than the initial one.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dict</span> |
| <span class="sd"> A dict or recursive dict whose key and value are primitive types, which includes</span> |
| <span class="sd"> Integer, String and Boolean.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> def initialOffset(self):</span> |
| <span class="sd"> ... return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"initialOffset"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataSourceStreamReader.latestOffset"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.latestOffset.html#pyspark.sql.datasource.DataSourceStreamReader.latestOffset">[docs]</a> <span class="k">def</span> <span class="nf">latestOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">dict</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the most recent offset available.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dict</span> |
| <span class="sd"> A dict or recursive dict whose key and value are primitive types, which includes</span> |
| <span class="sd"> Integer, String and Boolean.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> def latestOffset(self):</span> |
| <span class="sd"> ... return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"latestOffset"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataSourceStreamReader.partitions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.partitions.html#pyspark.sql.datasource.DataSourceStreamReader.partitions">[docs]</a> <span class="k">def</span> <span class="nf">partitions</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">dict</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-></span> <span class="n">Sequence</span><span class="p">[</span><span class="n">InputPartition</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a list of InputPartition given the start and end offsets. Each InputPartition</span> |
| <span class="sd"> represents a data split that can be processed by one Spark task. This may be called with</span> |
| <span class="sd"> an empty offset range when start == end, in that case the method should return</span> |
| <span class="sd"> an empty sequence of InputPartition.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : dict</span> |
| <span class="sd"> The start offset of the microbatch to plan partitioning.</span> |
| <span class="sd"> end : dict</span> |
| <span class="sd"> The end offset of the microbatch to plan partitioning.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> sequence of :class:`InputPartition`\\s</span> |
| <span class="sd"> A sequence of partitions for this data source. Each partition value</span> |
| <span class="sd"> must be an instance of `InputPartition` or a subclass of it.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"partitions"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataSourceStreamReader.read"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.read.html#pyspark.sql.datasource.DataSourceStreamReader.read">[docs]</a> <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">partition</span><span class="p">:</span> <span class="n">InputPartition</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">],</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Generates data for a given partition and returns an iterator of tuples or rows.</span> |
| |
| <span class="sd"> This method is invoked once per partition to read the data. Implementing</span> |
| <span class="sd"> this method is required for stream reader. You can initialize any</span> |
| <span class="sd"> non-serializable resources required for reading data from the data source</span> |
| <span class="sd"> within this method.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method is static and stateless. You shouldn't access mutable class member</span> |
| <span class="sd"> or keep in memory state between different invocations of read().</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> partition : :class:`InputPartition`</span> |
| <span class="sd"> The partition to read. It must be one of the partition values returned by</span> |
| <span class="sd"> :meth:`DataSourceStreamReader.partitions`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> iterator of tuples or :class:`Row`\\s</span> |
| <span class="sd"> An iterator of tuples or rows. Each tuple or row will be converted to a row</span> |
| <span class="sd"> in the final DataFrame.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"read"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataSourceStreamReader.commit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.commit.html#pyspark.sql.datasource.DataSourceStreamReader.commit">[docs]</a> <span class="k">def</span> <span class="nf">commit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Informs the source that Spark has completed processing all data for offsets less than or</span> |
| <span class="sd"> equal to `end` and will only request offsets greater than `end` in the future.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> end : dict</span> |
| <span class="sd"> The latest offset that the streaming query has processed for this source.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataSourceStreamReader.stop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.stop.html#pyspark.sql.datasource.DataSourceStreamReader.stop">[docs]</a> <span class="k">def</span> <span class="nf">stop</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Stop this source and free any resources it has allocated.</span> |
| <span class="sd"> Invoked when the streaming query terminated.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">SimpleDataSourceStreamReader</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A base class for simplified streaming data source readers.</span> |
| <span class="sd"> Compared to :class:`DataSourceStreamReader`, :class:`SimpleDataSourceStreamReader` doesn't</span> |
| <span class="sd"> require planning data partition. Also, the read api of :class:`SimpleDataSourceStreamReader`</span> |
| <span class="sd"> allows reading data and planning the latest offset at the same time.</span> |
| |
| <span class="sd"> Because :class:`SimpleDataSourceStreamReader` read records in Spark driver node to determine</span> |
| <span class="sd"> end offset of each batch without partitioning, it is only supposed to be used in</span> |
| <span class="sd"> lightweight use cases where input rate and batch size is small.</span> |
| <span class="sd"> Use :class:`DataSourceStreamReader` when read throughput is high and can't be handled</span> |
| <span class="sd"> by a single process.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">initialOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">dict</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the initial offset of the streaming data source.</span> |
| <span class="sd"> A new streaming query starts reading data from the initial offset.</span> |
| <span class="sd"> If Spark is restarting an existing query, it will restart from the check-pointed offset</span> |
| <span class="sd"> rather than the initial one.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dict</span> |
| <span class="sd"> A dict or recursive dict whose key and value are primitive types, which includes</span> |
| <span class="sd"> Integer, String and Boolean.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> def initialOffset(self):</span> |
| <span class="sd"> ... return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"initialOffset"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">],</span> <span class="nb">dict</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Read all available data from start offset and return the offset that next read attempt</span> |
| <span class="sd"> starts from.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : dict</span> |
| <span class="sd"> The start offset to start reading from.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> A :class:`Tuple` of an iterator of :class:`Tuple` and a dict\\s</span> |
| <span class="sd"> The iterator contains all the available records after start offset.</span> |
| <span class="sd"> The dict is the end offset of this read attempt and the start of next read attempt.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"read"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">readBetweenOffsets</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">dict</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Read all available data from specific start offset and end offset.</span> |
| <span class="sd"> This is invoked during failure recovery to re-read a batch deterministically.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : dict</span> |
| <span class="sd"> The start offset to start reading from.</span> |
| |
| <span class="sd"> end : dict</span> |
| <span class="sd"> The offset where the reading stop.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> iterator of :class:`Tuple`\\s</span> |
| <span class="sd"> All the records between start offset and end offset.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_IMPLEMENTED"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"readBetweenOffsets"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">commit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Informs the source that Spark has completed processing all data for offsets less than or</span> |
| <span class="sd"> equal to `end` and will only request offsets greater than `end` in the future.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> end : dict</span> |
| <span class="sd"> The latest offset that the streaming query has processed for this source.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="DataSourceWriter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.html#pyspark.sql.datasource.DataSourceWriter">[docs]</a><span class="k">class</span> <span class="nc">DataSourceWriter</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A base class for data source writers. Data source writers are responsible for saving</span> |
| <span class="sd"> the data to the data source.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="DataSourceWriter.write"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.write.html#pyspark.sql.datasource.DataSourceWriter.write">[docs]</a> <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"WriterCommitMessage"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Writes data into the data source.</span> |
| |
| <span class="sd"> This method is called once on each executor to write data to the data source.</span> |
| <span class="sd"> It accepts an iterator of input data and returns a single row representing a</span> |
| <span class="sd"> commit message, or None if there is no commit message.</span> |
| |
| <span class="sd"> The driver collects commit messages, if any, from all executors and passes them</span> |
| <span class="sd"> to the :class:`DataSourceWriter.commit` method if all tasks run successfully. If any</span> |
| <span class="sd"> task fails, the :class:`DataSourceWriter.abort` method will be called with the</span> |
| <span class="sd"> collected commit messages.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> iterator : iterator of :class:`Row`\\s</span> |
| <span class="sd"> An iterator of input data.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`WriterCommitMessage`</span> |
| <span class="sd"> a serializable commit message</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataSourceWriter.commit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.commit.html#pyspark.sql.datasource.DataSourceWriter.commit">[docs]</a> <span class="k">def</span> <span class="nf">commit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"WriterCommitMessage"</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Commits this writing job with a list of commit messages.</span> |
| |
| <span class="sd"> This method is invoked on the driver when all tasks run successfully. The</span> |
| <span class="sd"> commit messages are collected from the :meth:`DataSourceWriter.write` method call</span> |
| <span class="sd"> from each task, and are passed to this method. The implementation should use the</span> |
| <span class="sd"> commit messages to commit the writing job to the data source.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> messages : list of :class:`WriterCommitMessage`\\s</span> |
| <span class="sd"> A list of commit messages.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataSourceWriter.abort"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.abort.html#pyspark.sql.datasource.DataSourceWriter.abort">[docs]</a> <span class="k">def</span> <span class="nf">abort</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"WriterCommitMessage"</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aborts this writing job due to task failures.</span> |
| |
| <span class="sd"> This method is invoked on the driver when one or more tasks failed. The commit</span> |
| <span class="sd"> messages are collected from the :meth:`DataSourceWriter.write` method call from</span> |
| <span class="sd"> each task, and are passed to this method. The implementation should use the</span> |
| <span class="sd"> commit messages to abort the writing job to the data source.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> messages : list of :class:`WriterCommitMessage`\\s</span> |
| <span class="sd"> A list of commit messages.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">DataSourceStreamWriter</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A base class for data stream writers. Data stream writers are responsible for writing</span> |
| <span class="sd"> the data to the streaming sink.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"WriterCommitMessage"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Writes data into the streaming sink.</span> |
| |
| <span class="sd"> This method is called on executors to write data to the streaming data sink in</span> |
| <span class="sd"> each microbatch. It accepts an iterator of input data and returns a single row</span> |
| <span class="sd"> representing a commit message, or None if there is no commit message.</span> |
| |
| <span class="sd"> The driver collects commit messages, if any, from all executors and passes them</span> |
| <span class="sd"> to the ``commit`` method if all tasks run successfully. If any task fails, the</span> |
| <span class="sd"> ``abort`` method will be called with the collected commit messages.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> iterator : Iterator[Row]</span> |
| <span class="sd"> An iterator of input data.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> WriterCommitMessage : a serializable commit message</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="k">def</span> <span class="nf">commit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"WriterCommitMessage"</span><span class="p">],</span> <span class="n">batchId</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Commits this microbatch with a list of commit messages.</span> |
| |
| <span class="sd"> This method is invoked on the driver when all tasks run successfully. The</span> |
| <span class="sd"> commit messages are collected from the ``write`` method call from each task,</span> |
| <span class="sd"> and are passed to this method. The implementation should use the commit messages</span> |
| <span class="sd"> to commit the microbatch in the streaming sink.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> messages : List[WriterCommitMessage]</span> |
| <span class="sd"> A list of commit messages.</span> |
| <span class="sd"> batchId: int</span> |
| <span class="sd"> An integer that uniquely identifies a batch of data being written.</span> |
| <span class="sd"> The integer increase by 1 with each microbatch processed.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="k">def</span> <span class="nf">abort</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"WriterCommitMessage"</span><span class="p">],</span> <span class="n">batchId</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aborts this microbatch due to task failures.</span> |
| |
| <span class="sd"> This method is invoked on the driver when one or more tasks failed. The commit</span> |
| <span class="sd"> messages are collected from the ``write`` method call from each task, and are</span> |
| <span class="sd"> passed to this method. The implementation should use the commit messages to</span> |
| <span class="sd"> abort the microbatch in the streaming sink.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> messages : List[WriterCommitMessage]</span> |
| <span class="sd"> A list of commit messages.</span> |
| <span class="sd"> batchId: int</span> |
| <span class="sd"> An integer that uniquely identifies a batch of data being written.</span> |
| <span class="sd"> The integer increase by 1 with each microbatch processed.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="WriterCommitMessage"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.WriterCommitMessage.html#pyspark.sql.datasource.WriterCommitMessage">[docs]</a><span class="k">class</span> <span class="nc">WriterCommitMessage</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A commit message returned by the :meth:`DataSourceWriter.write` and will be</span> |
| <span class="sd"> sent back to the driver side as input parameter of :meth:`DataSourceWriter.commit`</span> |
| <span class="sd"> or :meth:`DataSourceWriter.abort` method.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This class must be picklable.</span> |
| <span class="sd"> """</span> |
| |
| <span class="o">...</span></div> |
| |
| |
| <div class="viewcode-block" id="DataSourceRegistration"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceRegistration.html#pyspark.sql.datasource.DataSourceRegistration">[docs]</a><span class="k">class</span> <span class="nc">DataSourceRegistration</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Wrapper for data source registration. This instance can be accessed by</span> |
| <span class="sd"> :attr:`spark.dataSource`.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">sparkSession</span><span class="p">:</span> <span class="s2">"SparkSession"</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span> <span class="o">=</span> <span class="n">sparkSession</span> |
| |
| <div class="viewcode-block" id="DataSourceRegistration.register"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceRegistration.register.html#pyspark.sql.datasource.DataSourceRegistration.register">[docs]</a> <span class="k">def</span> <span class="nf">register</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">dataSource</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="s2">"DataSource"</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Register a Python user-defined data source.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dataSource : type</span> |
| <span class="sd"> The data source class to be registered. It should be a subclass of DataSource.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">_wrap_function</span> |
| |
| <span class="n">name</span> <span class="o">=</span> <span class="n">dataSource</span><span class="o">.</span><span class="n">name</span><span class="p">()</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span><span class="o">.</span><span class="n">sparkContext</span> |
| <span class="c1"># Serialize the data source class.</span> |
| <span class="n">wrapped</span> <span class="o">=</span> <span class="n">_wrap_function</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">dataSource</span><span class="p">)</span> |
| <span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">jvm</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> |
| <span class="n">ds</span> <span class="o">=</span> <span class="n">jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">execution</span><span class="o">.</span><span class="n">datasources</span><span class="o">.</span><span class="n">v2</span><span class="o">.</span><span class="n">python</span><span class="o">.</span><span class="n">UserDefinedPythonDataSource</span><span class="p">(</span> |
| <span class="n">wrapped</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">dataSource</span><span class="p">()</span><span class="o">.</span><span class="n">registerPython</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">ds</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">CaseInsensitiveDict</span><span class="p">(</span><span class="n">UserDict</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A case-insensitive map of string keys to values.</span> |
| |
| <span class="sd"> This is used by Python data source options to ensure consistent case insensitivity.</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__setitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__setitem__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">(),</span> <span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">())</span> |
| |
| <span class="k">def</span> <span class="fm">__delitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__delitem__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">())</span> |
| |
| <span class="k">def</span> <span class="fm">__contains__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">object</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__contains__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">())</span> |
| <span class="k">return</span> <span class="kc">False</span> |
| |
| <span class="k">def</span> <span class="nf">update</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">dict</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="bp">self</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">v</span> |
| |
| <span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CaseInsensitiveDict"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)(</span><span class="bp">self</span><span class="p">)</span> |
| </pre></div> |
| |
| </body> |
| </html> |