Source code for pyspark.sql.datasource

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC, abstractmethod
from collections import UserDict
from typing import Any, Dict, Iterator, List, Sequence, Tuple, Type, Union, TYPE_CHECKING

from pyspark.sql import Row
from pyspark.sql.types import StructType
from pyspark.errors import PySparkNotImplementedError

if TYPE_CHECKING:
    from pyspark.sql.session import SparkSession

__all__ = [
    "DataSource",
    "DataSourceReader",
    "DataSourceStreamReader",
    "DataSourceWriter",
    "DataSourceRegistration",
    "InputPartition",
    "WriterCommitMessage",
]


class DataSource(ABC):
    """
    A base class for data sources.

    This class represents a custom data source that allows for reading from and/or
    writing to it. The data source provides methods to create readers and writers
    for reading and writing data, respectively. At least one of the methods
    :meth:`DataSource.reader` or :meth:`DataSource.writer` must be implemented
    by any subclass to make the data source either readable or writable (or both).

    After implementing this interface, you can start to load your data source using
    ``spark.read.format(...).load()`` and save data using ``df.write.format(...).save()``.

    .. versionadded:: 4.0.0
    """

    def __init__(self, options: Dict[str, str]) -> None:
        """
        Initializes the data source with user-provided options.

        Parameters
        ----------
        options : dict
            A case-insensitive dictionary representing the options for this data source.

        Notes
        -----
        This method should not be overridden.
        """
        self.options = options

    @classmethod
    def name(cls) -> str:
        """
        Returns a string representing the format name of this data source.

        By default, it is the class name of the data source. It can be overridden to
        provide a customized short name for the data source.

        Examples
        --------
        >>> def name(cls):
        ...     return "my_data_source"
        """
        return cls.__name__

    def schema(self) -> Union[StructType, str]:
        """
        Returns the schema of the data source.

        It can refer to any field initialized in the :meth:`DataSource.__init__` method
        to infer the data source's schema when users do not explicitly specify it.
        This method is invoked once when calling ``spark.read.format(...).load()``
        to get the schema for a data source read operation. If this method is not
        implemented, and a user does not provide a schema when reading the data source,
        an exception will be thrown.

        Returns
        -------
        schema : :class:`StructType` or str
            The schema of this data source or a DDL string representing the schema.

        Examples
        --------
        Returns a DDL string:

        >>> def schema(self):
        ...     return "a INT, b STRING"

        Returns a :class:`StructType`:

        >>> def schema(self):
        ...     return StructType().add("a", "int").add("b", "string")
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "schema"},
        )

    def reader(self, schema: StructType) -> "DataSourceReader":
        """
        Returns a :class:`DataSourceReader` instance for reading data.

        The implementation is required for readable data sources.

        Parameters
        ----------
        schema : :class:`StructType`
            The schema of the data to be read.

        Returns
        -------
        reader : :class:`DataSourceReader`
            A reader instance for this data source.
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "reader"},
        )

    def writer(self, schema: StructType, overwrite: bool) -> "DataSourceWriter":
        """
        Returns a :class:`DataSourceWriter` instance for writing data.

        The implementation is required for writable data sources.

        Parameters
        ----------
        schema : :class:`StructType`
            The schema of the data to be written.
        overwrite : bool
            A flag indicating whether to overwrite existing data when writing to the data source.

        Returns
        -------
        writer : :class:`DataSourceWriter`
            A writer instance for this data source.
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "writer"},
        )

    def streamWriter(self, schema: StructType, overwrite: bool) -> "DataSourceStreamWriter":
        """
        Returns a :class:`DataSourceStreamWriter` instance for writing data into a streaming sink.

        The implementation is required for writable streaming data sources.

        Parameters
        ----------
        schema : :class:`StructType`
            The schema of the data to be written.
        overwrite : bool
            A flag indicating whether to overwrite existing data when writing the current
            microbatch.

        Returns
        -------
        writer : :class:`DataSourceStreamWriter`
            A writer instance for writing data into a streaming sink.
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "streamWriter"},
        )

    def simpleStreamReader(self, schema: StructType) -> "SimpleDataSourceStreamReader":
        """
        Returns a :class:`SimpleDataSourceStreamReader` instance for reading data.

        One of simpleStreamReader() and streamReader() must be implemented for a readable
        streaming data source. Spark checks whether streamReader() is implemented; if it is,
        Spark creates a DataSourceStreamReader to read data. simpleStreamReader() is only
        invoked when streamReader() is not implemented.

        Parameters
        ----------
        schema : :class:`StructType`
            The schema of the data to be read.

        Returns
        -------
        reader : :class:`SimpleDataSourceStreamReader`
            A reader instance for this data source.
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "simpleStreamReader"},
        )

    def streamReader(self, schema: StructType) -> "DataSourceStreamReader":
        """
        Returns a :class:`DataSourceStreamReader` instance for reading streaming data.

        One of simpleStreamReader() and streamReader() must be implemented for a readable
        streaming data source.

        Parameters
        ----------
        schema : :class:`StructType`
            The schema of the data to be read.

        Returns
        -------
        reader : :class:`DataSourceStreamReader`
            A reader instance for this streaming data source.
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "streamReader"},
        )


class InputPartition:
    """
    A base class representing an input partition returned by the `partitions()`
    method of :class:`DataSourceReader`.

    .. versionadded:: 4.0.0

    Notes
    -----
    This class must be picklable.

    Examples
    --------
    Use the default input partition implementation:

    >>> def partitions(self):
    ...     return [InputPartition(1)]

    Subclass the input partition class:

    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class RangeInputPartition(InputPartition):
    ...     start: int
    ...     end: int

    >>> def partitions(self):
    ...     return [RangeInputPartition(1, 3), RangeInputPartition(4, 6)]
    """

    def __init__(self, value: Any) -> None:
        self.value = value

    def __repr__(self) -> str:
        attributes = ", ".join([f"{k}={v!r}" for k, v in self.__dict__.items()])
        return f"{self.__class__.__name__}({attributes})"


class DataSourceReader(ABC):
    """
    A base class for data source readers. Data source readers are responsible for
    outputting data from a data source.

    .. versionadded:: 4.0.0
    """

    def partitions(self) -> Sequence[InputPartition]:
        """
        Returns a sequence of partitions for this data source.

        Partitions are used to split data reading operations into parallel tasks.
        If this method returns N partitions, the query planner will create N tasks.
        Each task will execute :meth:`DataSourceReader.read` in parallel, using the respective
        partition value to read the data.

        This method is called once during query planning. By default, it returns a
        single partition with the value ``None``. Subclasses can override this method
        to return multiple partitions.

        It's recommended to override this method for better performance when reading
        large datasets.

        Returns
        -------
        sequence of :class:`InputPartition`\\s
            A sequence of partitions for this data source. Each partition value
            must be an instance of `InputPartition` or a subclass of it.

        Notes
        -----
        All partition values must be picklable objects.

        Examples
        --------
        Returns a list of integers:

        >>> def partitions(self):
        ...     return [InputPartition(1), InputPartition(2), InputPartition(3)]

        Returns a list of strings:

        >>> def partitions(self):
        ...     return [InputPartition("a"), InputPartition("b"), InputPartition("c")]

        Returns a list of ranges:

        >>> class RangeInputPartition(InputPartition):
        ...     def __init__(self, start, end):
        ...         self.start = start
        ...         self.end = end

        >>> def partitions(self):
        ...     return [RangeInputPartition(1, 3), RangeInputPartition(5, 10)]
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "partitions"},
        )

    @abstractmethod
    def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator[Row]]:
        """
        Generates data for a given partition and returns an iterator of tuples or rows.

        This method is invoked once per partition to read the data. Implementing
        this method is required for readable data sources. You can initialize any
        non-serializable resources required for reading data from the data source
        within this method.

        Parameters
        ----------
        partition : object
            The partition to read. It must be one of the partition values returned by
            :meth:`DataSourceReader.partitions`.

        Returns
        -------
        iterator of tuples or :class:`Row`\\s
            An iterator of tuples or rows. Each tuple or row will be converted to a row
            in the final DataFrame.

        Examples
        --------
        Yields a list of tuples:

        >>> def read(self, partition: InputPartition):
        ...     yield (partition.value, 0)
        ...     yield (partition.value, 1)

        Yields a list of rows:

        >>> def read(self, partition: InputPartition):
        ...     yield Row(partition=partition.value, value=0)
        ...     yield Row(partition=partition.value, value=1)
        """
        ...
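

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this module): a minimal batch data source
# built on the DataSource, DataSourceReader and InputPartition classes above.
# The names ``RangePartition``, ``RangeReader``, ``RangeDataSource`` and the
# ``num_partitions`` option are assumptions made for this example only.
# ---------------------------------------------------------------------------
class RangePartition(InputPartition):
    def __init__(self, start: int, end: int) -> None:
        self.start = start
        self.end = end


class RangeReader(DataSourceReader):
    def __init__(self, num_partitions: int) -> None:
        self.num_partitions = num_partitions

    def partitions(self) -> Sequence[InputPartition]:
        # One partition per task; each partition carries its own [start, end) range.
        return [RangePartition(i * 10, (i + 1) * 10) for i in range(self.num_partitions)]

    def read(self, partition: RangePartition) -> Iterator[Tuple]:
        # Runs once per partition on an executor; each yielded tuple becomes a row.
        for i in range(partition.start, partition.end):
            yield (i, str(i))


class RangeDataSource(DataSource):
    @classmethod
    def name(cls) -> str:
        return "example_range"

    def schema(self) -> str:
        return "id INT, value STRING"

    def reader(self, schema: StructType) -> DataSourceReader:
        return RangeReader(int(self.options.get("num_partitions", "2")))


# Expected usage, assuming a running SparkSession and registration through
# ``DataSourceRegistration`` (``spark.dataSource.register``):
#   spark.dataSource.register(RangeDataSource)
#   spark.read.format("example_range").option("num_partitions", "4").load().show()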


class DataSourceStreamReader(ABC):
    """
    A base class for streaming data source readers. Data source stream readers are responsible
    for outputting data from a streaming data source.

    .. versionadded:: 4.0.0
    """

    def initialOffset(self) -> dict:
        """
        Returns the initial offset of the streaming data source.

        A new streaming query starts reading data from the initial offset.
        If Spark is restarting an existing query, it will restart from the checkpointed offset
        rather than the initial one.

        Returns
        -------
        dict
            A dict or recursive dict whose keys and values are primitive types
            (integer, string, and boolean).

        Examples
        --------
        >>> def initialOffset(self):
        ...     return {"partition-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "initialOffset"},
        )

    def latestOffset(self) -> dict:
        """
        Returns the most recent offset available.

        Returns
        -------
        dict
            A dict or recursive dict whose keys and values are primitive types
            (integer, string, and boolean).

        Examples
        --------
        >>> def latestOffset(self):
        ...     return {"partition-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "latestOffset"},
        )

    def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]:
        """
        Returns a list of InputPartition given the start and end offsets. Each InputPartition
        represents a data split that can be processed by one Spark task. This method may be
        called with an empty offset range when start == end; in that case, it should return
        an empty sequence of InputPartition.

        Parameters
        ----------
        start : dict
            The start offset of the microbatch to plan partitioning.
        end : dict
            The end offset of the microbatch to plan partitioning.

        Returns
        -------
        sequence of :class:`InputPartition`\\s
            A sequence of partitions for this data source. Each partition value
            must be an instance of `InputPartition` or a subclass of it.
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "partitions"},
        )

    @abstractmethod
    def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator[Row]]:
        """
        Generates data for a given partition and returns an iterator of tuples or rows.

        This method is invoked once per partition to read the data. Implementing
        this method is required for stream readers. You can initialize any
        non-serializable resources required for reading data from the data source
        within this method.

        Notes
        -----
        This method is static and stateless. You shouldn't access mutable class members
        or keep in-memory state between different invocations of read().

        Parameters
        ----------
        partition : :class:`InputPartition`
            The partition to read. It must be one of the partition values returned by
            :meth:`DataSourceStreamReader.partitions`.

        Returns
        -------
        iterator of tuples or :class:`Row`\\s
            An iterator of tuples or rows. Each tuple or row will be converted to a row
            in the final DataFrame.
        """
        raise PySparkNotImplementedError(
            error_class="NOT_IMPLEMENTED",
            message_parameters={"feature": "read"},
        )

    def commit(self, end: dict) -> None:
        """
        Informs the source that Spark has completed processing all data for offsets less than or
        equal to `end` and will only request offsets greater than `end` in the future.

        Parameters
        ----------
        end : dict
            The latest offset that the streaming query has processed for this source.
        """
        ...
<div class="viewcode-block" id="DataSourceStreamReader.stop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceStreamReader.stop.html#pyspark.sql.datasource.DataSourceStreamReader.stop">[docs]</a> <span class="k">def</span> <span class="nf">stop</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Stop this source and free any resources it has allocated.</span>
<span class="sd"> Invoked when the streaming query terminated.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div></div>
<span class="k">class</span> <span class="nc">SimpleDataSourceStreamReader</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A base class for simplified streaming data source readers.</span>
<span class="sd"> Compared to :class:`DataSourceStreamReader`, :class:`SimpleDataSourceStreamReader` doesn&#39;t</span>
<span class="sd"> require planning data partition. Also, the read api of :class:`SimpleDataSourceStreamReader`</span>
<span class="sd"> allows reading data and planning the latest offset at the same time.</span>
<span class="sd"> Because :class:`SimpleDataSourceStreamReader` read records in Spark driver node to determine</span>
<span class="sd"> end offset of each batch without partitioning, it is only supposed to be used in</span>
<span class="sd"> lightweight use cases where input rate and batch size is small.</span>
<span class="sd"> Use :class:`DataSourceStreamReader` when read throughput is high and can&#39;t be handled</span>
<span class="sd"> by a single process.</span>
<span class="sd"> .. versionadded: 4.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">initialOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">dict</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the initial offset of the streaming data source.</span>
<span class="sd"> A new streaming query starts reading data from the initial offset.</span>
<span class="sd"> If Spark is restarting an existing query, it will restart from the check-pointed offset</span>
<span class="sd"> rather than the initial one.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> dict</span>
<span class="sd"> A dict or recursive dict whose key and value are primitive types, which includes</span>
<span class="sd"> Integer, String and Boolean.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; def initialOffset(self):</span>
<span class="sd"> ... return {&quot;parititon-1&quot;: {&quot;index&quot;: 3, &quot;closed&quot;: True}, &quot;partition-2&quot;: {&quot;index&quot;: 5}}</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_IMPLEMENTED&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;initialOffset&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">],</span> <span class="nb">dict</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read all available data from start offset and return the offset that next read attempt</span>
<span class="sd"> starts from.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : dict</span>
<span class="sd"> The start offset to start reading from.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> A :class:`Tuple` of an iterator of :class:`Tuple` and a dict\\s</span>
<span class="sd"> The iterator contains all the available records after start offset.</span>
<span class="sd"> The dict is the end offset of this read attempt and the start of next read attempt.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_IMPLEMENTED&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;read&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">readBetweenOffsets</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">dict</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read all available data from specific start offset and end offset.</span>
<span class="sd"> This is invoked during failure recovery to re-read a batch deterministically.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : dict</span>
<span class="sd"> The start offset to start reading from.</span>
<span class="sd"> end : dict</span>
<span class="sd"> The offset where the reading stop.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> iterator of :class:`Tuple`\\s</span>
<span class="sd"> All the records between start offset and end offset.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkNotImplementedError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_IMPLEMENTED&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;readBetweenOffsets&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">commit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Informs the source that Spark has completed processing all data for offsets less than or</span>
<span class="sd"> equal to `end` and will only request offsets greater than `end` in the future.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> end : dict</span>
<span class="sd"> The latest offset that the streaming query has processed for this source.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataSourceWriter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.html#pyspark.sql.datasource.DataSourceWriter">[docs]</a><span class="k">class</span> <span class="nc">DataSourceWriter</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A base class for data source writers. Data source writers are responsible for saving</span>
<span class="sd"> the data to the data source.</span>
<span class="sd"> .. versionadded: 4.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="DataSourceWriter.write"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.write.html#pyspark.sql.datasource.DataSourceWriter.write">[docs]</a> <span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;WriterCommitMessage&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Writes data into the data source.</span>
<span class="sd"> This method is called once on each executor to write data to the data source.</span>
<span class="sd"> It accepts an iterator of input data and returns a single row representing a</span>
<span class="sd"> commit message, or None if there is no commit message.</span>
<span class="sd"> The driver collects commit messages, if any, from all executors and passes them</span>
<span class="sd"> to the :class:`DataSourceWriter.commit` method if all tasks run successfully. If any</span>
<span class="sd"> task fails, the :class:`DataSourceWriter.abort` method will be called with the</span>
<span class="sd"> collected commit messages.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> iterator : iterator of :class:`Row`\\s</span>
<span class="sd"> An iterator of input data.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`WriterCommitMessage`</span>
<span class="sd"> a serializable commit message</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataSourceWriter.commit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.commit.html#pyspark.sql.datasource.DataSourceWriter.commit">[docs]</a> <span class="k">def</span> <span class="nf">commit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;WriterCommitMessage&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Commits this writing job with a list of commit messages.</span>
<span class="sd"> This method is invoked on the driver when all tasks run successfully. The</span>
<span class="sd"> commit messages are collected from the :meth:`DataSourceWriter.write` method call</span>
<span class="sd"> from each task, and are passed to this method. The implementation should use the</span>
<span class="sd"> commit messages to commit the writing job to the data source.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> messages : list of :class:`WriterCommitMessage`\\s</span>
<span class="sd"> A list of commit messages.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataSourceWriter.abort"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceWriter.abort.html#pyspark.sql.datasource.DataSourceWriter.abort">[docs]</a> <span class="k">def</span> <span class="nf">abort</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;WriterCommitMessage&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aborts this writing job due to task failures.</span>
<span class="sd"> This method is invoked on the driver when one or more tasks failed. The commit</span>
<span class="sd"> messages are collected from the :meth:`DataSourceWriter.write` method call from</span>
<span class="sd"> each task, and are passed to this method. The implementation should use the</span>
<span class="sd"> commit messages to abort the writing job to the data source.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> messages : list of :class:`WriterCommitMessage`\\s</span>
<span class="sd"> A list of commit messages.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div></div>
<span class="k">class</span> <span class="nc">DataSourceStreamWriter</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A base class for data stream writers. Data stream writers are responsible for writing</span>
<span class="sd"> the data to the streaming sink.</span>
<span class="sd"> .. versionadded: 4.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;WriterCommitMessage&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Writes data into the streaming sink.</span>
<span class="sd"> This method is called on executors to write data to the streaming data sink in</span>
<span class="sd"> each microbatch. It accepts an iterator of input data and returns a single row</span>
<span class="sd"> representing a commit message, or None if there is no commit message.</span>
<span class="sd"> The driver collects commit messages, if any, from all executors and passes them</span>
<span class="sd"> to the ``commit`` method if all tasks run successfully. If any task fails, the</span>
<span class="sd"> ``abort`` method will be called with the collected commit messages.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> iterator : Iterator[Row]</span>
<span class="sd"> An iterator of input data.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> WriterCommitMessage : a serializable commit message</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="k">def</span> <span class="nf">commit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;WriterCommitMessage&quot;</span><span class="p">],</span> <span class="n">batchId</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Commits this microbatch with a list of commit messages.</span>
<span class="sd"> This method is invoked on the driver when all tasks run successfully. The</span>
<span class="sd"> commit messages are collected from the ``write`` method call from each task,</span>
<span class="sd"> and are passed to this method. The implementation should use the commit messages</span>
<span class="sd"> to commit the microbatch in the streaming sink.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> messages : List[WriterCommitMessage]</span>
<span class="sd"> A list of commit messages.</span>
<span class="sd"> batchId: int</span>
<span class="sd"> An integer that uniquely identifies a batch of data being written.</span>
<span class="sd"> The integer increase by 1 with each microbatch processed.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="k">def</span> <span class="nf">abort</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">messages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;WriterCommitMessage&quot;</span><span class="p">],</span> <span class="n">batchId</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aborts this microbatch due to task failures.</span>
<span class="sd"> This method is invoked on the driver when one or more tasks failed. The commit</span>
<span class="sd"> messages are collected from the ``write`` method call from each task, and are</span>
<span class="sd"> passed to this method. The implementation should use the commit messages to</span>
<span class="sd"> abort the microbatch in the streaming sink.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> messages : List[WriterCommitMessage]</span>
<span class="sd"> A list of commit messages.</span>
<span class="sd"> batchId: int</span>
<span class="sd"> An integer that uniquely identifies a batch of data being written.</span>
<span class="sd"> The integer increase by 1 with each microbatch processed.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="WriterCommitMessage"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.WriterCommitMessage.html#pyspark.sql.datasource.WriterCommitMessage">[docs]</a><span class="k">class</span> <span class="nc">WriterCommitMessage</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A commit message returned by the :meth:`DataSourceWriter.write` and will be</span>
<span class="sd"> sent back to the driver side as input parameter of :meth:`DataSourceWriter.commit`</span>
<span class="sd"> or :meth:`DataSourceWriter.abort` method.</span>
<span class="sd"> .. versionadded: 4.0.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This class must be picklable.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataSourceRegistration"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceRegistration.html#pyspark.sql.datasource.DataSourceRegistration">[docs]</a><span class="k">class</span> <span class="nc">DataSourceRegistration</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Wrapper for data source registration. This instance can be accessed by</span>
<span class="sd"> :attr:`spark.dataSource`.</span>
<span class="sd"> .. versionadded: 4.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">sparkSession</span><span class="p">:</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span> <span class="o">=</span> <span class="n">sparkSession</span>
<div class="viewcode-block" id="DataSourceRegistration.register"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.datasource.DataSourceRegistration.register.html#pyspark.sql.datasource.DataSourceRegistration.register">[docs]</a> <span class="k">def</span> <span class="nf">register</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">dataSource</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="s2">&quot;DataSource&quot;</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Register a Python user-defined data source.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dataSource : type</span>
<span class="sd"> The data source class to be registered. It should be a subclass of DataSource.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">_wrap_function</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">dataSource</span><span class="o">.</span><span class="n">name</span><span class="p">()</span>
<span class="n">sc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span><span class="o">.</span><span class="n">sparkContext</span>
<span class="c1"># Serialize the data source class.</span>
<span class="n">wrapped</span> <span class="o">=</span> <span class="n">_wrap_function</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">dataSource</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">jvm</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span>
<span class="n">ds</span> <span class="o">=</span> <span class="n">jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">execution</span><span class="o">.</span><span class="n">datasources</span><span class="o">.</span><span class="n">v2</span><span class="o">.</span><span class="n">python</span><span class="o">.</span><span class="n">UserDefinedPythonDataSource</span><span class="p">(</span>
<span class="n">wrapped</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">dataSource</span><span class="p">()</span><span class="o">.</span><span class="n">registerPython</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">ds</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">CaseInsensitiveDict</span><span class="p">(</span><span class="n">UserDict</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A case-insensitive map of string keys to values.</span>
<span class="sd"> This is used by Python data source options to ensure consistent case insensitivity.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__setitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__setitem__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">(),</span> <span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">())</span>
<span class="k">def</span> <span class="fm">__delitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__delitem__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">())</span>
<span class="k">def</span> <span class="fm">__contains__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">object</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__contains__</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">lower</span><span class="p">())</span>
<span class="k">return</span> <span class="kc">False</span>
<span class="k">def</span> <span class="nf">update</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">dict</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="bp">self</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">v</span>
<span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CaseInsensitiveDict&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)(</span><span class="bp">self</span><span class="p">)</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>