blob: d3e22978c9b23bb0f69e2fdf190f6402c6a42e0f [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>pyspark.ml.feature &#8212; PySpark 4.0.0-preview1 documentation</title>
<script data-cfasync="false">
// Copy the reader's saved display preferences from localStorage onto the root
// <html> element as data-mode / data-theme attributes.
// When nothing is stored, mode falls back to "" and theme falls back to "light".
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/ml/feature';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/feature.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en">
<!-- Matomo -->
<script type="text/javascript">
// Matomo analytics bootstrap.
// _paq is the command queue: reuse window._paq if it already exists, otherwise
// start a new array. Commands pushed here are plain arrays of
// [methodName, ...arguments].
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
// Queue: turn cookies off, record this page view, and enable link tracking.
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
// Base URL of the analytics server; both the tracking endpoint and the
// tracker script are resolved against it.
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// Create an async <script> for matomo.js and insert it before the first
// <script> element already in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../../../index.html">
<img src="../../../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/>
<script>document.write(`<img src="../../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
/**
 * Build the documentation homepage URL for one version entry.
 *
 * @param {{version: string}} entry - Version record; only `version` is read.
 * @returns {string} The URL pattern with its first "{version}" placeholder
 *   replaced by `entry.version`.
 */
function buildURL(entry) {
  // URL pattern supplied by jinja when the page is generated.
  const urlPattern = "https://spark.apache.org/docs/{version}/api/python/index.html";
  return urlPattern.replace("{version}", entry.version);
}
/**
 * Click handler for a version-switcher link.
 *
 * Sends a HEAD request for this page's path under the selected docs version.
 * If the request succeeds the browser navigates to that page; otherwise it
 * navigates to the selected version's homepage (the link's own href).
 *
 * @param {Event} event - Click event dispatched on a version link.
 * @returns {boolean} Always false, which cancels the link's default
 *   navigation so the redirect is driven by the AJAX callbacks instead.
 */
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/ml/feature.html";
  const link = event.currentTarget || event.target;
  const otherDocsHomepage = link.getAttribute("href");

  // The href names the homepage *file* (".../api/python/index.html").
  // Appending the page path to it directly produced
  // ".../index.html_modules/...", which never exists, so the HEAD request
  // always failed. Reduce the href to its directory first.
  let docsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  if (!docsRoot.endsWith("/")) {
    docsRoot += "/";
  }
  const tryUrl = `${docsRoot}${currentFilePath}`;

  $.ajax({ type: 'HEAD', url: tryUrl })
    // if the page exists, go there
    .done(function () {
      location.href = tryUrl;
    })
    // otherwise fall back to the other version's homepage
    .fail(function () {
      location.href = otherDocsHomepage;
    });
  return false;
}
// Populate the version switcher menu that sits next to this <script> element.
(function () {
  // This markup is rendered more than once on the page, so the
  // "version_switcher" id is not unique and $("#version_switcher") always
  // resolves to the first copy (which then receives every entry twice while
  // the other copy stays empty). Resolve the menu relative to the running
  // script instead. This has to happen synchronously: document.currentScript
  // is null by the time the AJAX callback runs.
  const script = document.currentScript;
  const scope = script && script.parentElement;
  const menu =
    (scope && scope.querySelector("#version_switcher")) ||
    document.getElementById("version_switcher");
  if (!menu) {
    return; // nothing to populate
  }

  // Pin this script's own helpers now. Another inline script on the page
  // declares global functions with the same names, and a later declaration
  // replaces the global binding before the AJAX callback fires.
  const toUrl = buildURL;
  const onVersionClick = checkPageExistsAndRedirect;

  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function (data) {
    // Create the nodes in response order; each link initially points at the
    // homepage of its docs version and the click handler refines the target.
    $.each(data, function (index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = toUrl(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = onVersionClick;
      menu.appendChild(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar hide-on-wide">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
/**
 * Turn a version entry into the homepage URL of that docs version.
 *
 * @param {{version: string}} entry - Version record; only `version` is read.
 * @returns {string} The URL pattern with its first "{version}" placeholder
 *   replaced by `entry.version`.
 */
function buildURL(entry) {
  // supplied by jinja
  const homepagePattern = "https://spark.apache.org/docs/{version}/api/python/index.html";
  const homepageUrl = homepagePattern.replace("{version}", entry.version);
  return homepageUrl;
}
/**
 * Click handler for a version-switcher link.
 *
 * Sends a HEAD request for this page's path under the selected docs version.
 * If the request succeeds the browser navigates to that page; otherwise it
 * navigates to the selected version's homepage (the link's own href).
 *
 * @param {Event} event - Click event dispatched on a version link.
 * @returns {boolean} Always false, which cancels the link's default
 *   navigation so the redirect is driven by the AJAX callbacks instead.
 */
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/ml/feature.html";
  const link = event.currentTarget || event.target;
  const otherDocsHomepage = link.getAttribute("href");

  // The href names the homepage *file* (".../api/python/index.html").
  // Appending the page path to it directly produced
  // ".../index.html_modules/...", which never exists, so the HEAD request
  // always failed. Reduce the href to its directory first.
  let docsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  if (!docsRoot.endsWith("/")) {
    docsRoot += "/";
  }
  const tryUrl = `${docsRoot}${currentFilePath}`;

  $.ajax({ type: 'HEAD', url: tryUrl })
    // if the page exists, go there
    .done(function () {
      location.href = tryUrl;
    })
    // otherwise fall back to the other version's homepage
    .fail(function () {
      location.href = otherDocsHomepage;
    });
  return false;
}
// Populate the version switcher menu that sits next to this <script> element.
(function () {
  // This markup is rendered more than once on the page, so the
  // "version_switcher" id is not unique and $("#version_switcher") always
  // resolves to the first copy, leaving this copy's menu empty. Resolve the
  // menu relative to the running script instead. This has to happen
  // synchronously: document.currentScript is null by the time the AJAX
  // callback runs.
  const script = document.currentScript;
  const scope = script && script.parentElement;
  const menu =
    (scope && scope.querySelector("#version_switcher")) ||
    document.getElementById("version_switcher");
  if (!menu) {
    return; // nothing to populate
  }

  // Pin the helpers as they are bound while this script runs, so the handler
  // attached below does not depend on whatever the global names point to when
  // the AJAX callback eventually fires.
  const toUrl = buildURL;
  const onVersionClick = checkPageExistsAndRedirect;

  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function (data) {
    // Create the nodes in response order; each link initially points at the
    // homepage of its docs version and the click handler refines the target.
    $.each(data, function (index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = toUrl(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = onVersionClick;
      menu.appendChild(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs" aria-label="Breadcrumb">
<li class="breadcrumb-item breadcrumb-home">
<a href="../../../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li>
<li class="breadcrumb-item active" aria-current="page">pyspark.ml.feature</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<h1>Source code for pyspark.ml.feature</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">cast</span><span class="p">,</span>
<span class="n">overload</span><span class="p">,</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">Generic</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">TypeVar</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">TYPE_CHECKING</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">keyword_only</span><span class="p">,</span> <span class="n">since</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">_convert_to_vector</span><span class="p">,</span> <span class="n">DenseMatrix</span><span class="p">,</span> <span class="n">DenseVector</span><span class="p">,</span> <span class="n">Vector</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.param.shared</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">HasThreshold</span><span class="p">,</span>
<span class="n">HasThresholds</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span>
<span class="n">HasRelativeError</span><span class="p">,</span>
<span class="n">HasFeaturesCol</span><span class="p">,</span>
<span class="n">HasLabelCol</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span>
<span class="n">HasNumFeatures</span><span class="p">,</span>
<span class="n">HasStepSize</span><span class="p">,</span>
<span class="n">HasMaxIter</span><span class="p">,</span>
<span class="n">TypeConverters</span><span class="p">,</span>
<span class="n">Param</span><span class="p">,</span>
<span class="n">Params</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.util</span> <span class="kn">import</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.wrapper</span> <span class="kn">import</span> <span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">JavaModel</span><span class="p">,</span> <span class="n">JavaParams</span><span class="p">,</span> <span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">_jvm</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.common</span> <span class="kn">import</span> <span class="n">inherit_doc</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span>
<span class="n">JM</span> <span class="o">=</span> <span class="n">TypeVar</span><span class="p">(</span><span class="s2">&quot;JM&quot;</span><span class="p">,</span> <span class="n">bound</span><span class="o">=</span><span class="n">JavaTransformer</span><span class="p">)</span>
<span class="n">P</span> <span class="o">=</span> <span class="n">TypeVar</span><span class="p">(</span><span class="s2">&quot;P&quot;</span><span class="p">,</span> <span class="n">bound</span><span class="o">=</span><span class="n">Params</span><span class="p">)</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span>
<span class="s2">&quot;Binarizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;BucketedRandomProjectionLSH&quot;</span><span class="p">,</span>
<span class="s2">&quot;BucketedRandomProjectionLSHModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;Bucketizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;ChiSqSelector&quot;</span><span class="p">,</span>
<span class="s2">&quot;ChiSqSelectorModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;CountVectorizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;DCT&quot;</span><span class="p">,</span>
<span class="s2">&quot;ElementwiseProduct&quot;</span><span class="p">,</span>
<span class="s2">&quot;FeatureHasher&quot;</span><span class="p">,</span>
<span class="s2">&quot;HashingTF&quot;</span><span class="p">,</span>
<span class="s2">&quot;IDF&quot;</span><span class="p">,</span>
<span class="s2">&quot;IDFModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;Imputer&quot;</span><span class="p">,</span>
<span class="s2">&quot;ImputerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;IndexToString&quot;</span><span class="p">,</span>
<span class="s2">&quot;Interaction&quot;</span><span class="p">,</span>
<span class="s2">&quot;MaxAbsScaler&quot;</span><span class="p">,</span>
<span class="s2">&quot;MaxAbsScalerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;MinHashLSH&quot;</span><span class="p">,</span>
<span class="s2">&quot;MinHashLSHModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;MinMaxScaler&quot;</span><span class="p">,</span>
<span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;NGram&quot;</span><span class="p">,</span>
<span class="s2">&quot;Normalizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">,</span>
<span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;PCA&quot;</span><span class="p">,</span>
<span class="s2">&quot;PCAModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;PolynomialExpansion&quot;</span><span class="p">,</span>
<span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;RobustScaler&quot;</span><span class="p">,</span>
<span class="s2">&quot;RobustScalerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;RFormula&quot;</span><span class="p">,</span>
<span class="s2">&quot;RFormulaModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;SQLTransformer&quot;</span><span class="p">,</span>
<span class="s2">&quot;StandardScaler&quot;</span><span class="p">,</span>
<span class="s2">&quot;StandardScalerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">,</span>
<span class="s2">&quot;StringIndexer&quot;</span><span class="p">,</span>
<span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;Tokenizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">,</span>
<span class="s2">&quot;UnivariateFeatureSelectorModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;VarianceThresholdSelector&quot;</span><span class="p">,</span>
<span class="s2">&quot;VarianceThresholdSelectorModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;VectorAssembler&quot;</span><span class="p">,</span>
<span class="s2">&quot;VectorIndexer&quot;</span><span class="p">,</span>
<span class="s2">&quot;VectorIndexerModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;VectorSizeHint&quot;</span><span class="p">,</span>
<span class="s2">&quot;VectorSlicer&quot;</span><span class="p">,</span>
<span class="s2">&quot;Word2Vec&quot;</span><span class="p">,</span>
<span class="s2">&quot;Word2VecModel&quot;</span><span class="p">,</span>
<span class="p">]</span>
<div class="viewcode-block" id="Binarizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Binarizer</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasThreshold</span><span class="p">,</span>
<span class="n">HasThresholds</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Binarizer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Binarize a column of continuous features given a threshold. Since 3.0.0,</span>
<span class="sd"> :py:class:`Binarizer` can map multiple columns at once by setting the :py:attr:`inputCols`</span>
<span class="sd"> parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters</span>
<span class="sd"> are set, an Exception will be thrown. The :py:attr:`threshold` parameter is used for</span>
<span class="sd"> single column usage, and :py:attr:`thresholds` is for multiple columns.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0.5,)], [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; binarizer = Binarizer(threshold=1.0, inputCol=&quot;values&quot;, outputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setThreshold(1.0)</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setInputCol(&quot;values&quot;)</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setOutputCol(&quot;features&quot;)</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer.transform(df).head().features</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setParams(outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; params = {binarizer.threshold: -0.5, binarizer.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; binarizer.transform(df, params).head().vector</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; binarizerPath = temp_path + &quot;/binarizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; binarizer.save(binarizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBinarizer = Binarizer.load(binarizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBinarizer.getThreshold() == binarizer.getThreshold()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedBinarizer.transform(df).take(1) == binarizer.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(0.5, 0.3)], [&quot;values1&quot;, &quot;values2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; binarizer2 = Binarizer(thresholds=[0.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; binarizer2.setInputCols([&quot;values1&quot;, &quot;values2&quot;]).setOutputCols([&quot;output1&quot;, &quot;output2&quot;])</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer2.transform(df2).show()</span>
<span class="sd"> +-------+-------+-------+-------+</span>
<span class="sd"> |values1|values2|output1|output2|</span>
<span class="sd"> +-------+-------+-------+-------+</span>
<span class="sd"> | 0.5| 0.3| 1.0| 0.0|</span>
<span class="sd"> +-------+-------+-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">threshold</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;threshold&quot;</span><span class="p">,</span>
<span class="s2">&quot;Param for threshold used to binarize continuous features. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;The features greater than the threshold will be binarized to 1.0. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;The features equal to or less than the threshold will be binarized to 0.0&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">thresholds</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;thresholds&quot;</span><span class="p">,</span>
<span class="s2">&quot;Param for array of threshold used to binarize continuous features. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;This is for multiple columns input. If transforming multiple columns &quot;</span>
<span class="o">+</span> <span class="s2">&quot;and thresholds is not set, but threshold is set, then threshold will &quot;</span>
<span class="o">+</span> <span class="s2">&quot;be applied across all columns.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \</span>
<span class="sd"> inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Binarizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Binarizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="Binarizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \</span>
<span class="sd"> inputCols=None, outputCols=None)</span>
<span class="sd"> Sets params for this Binarizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`threshold`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setThresholds"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setThresholds">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setThresholds</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`thresholds`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">thresholds</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Binarizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_LSHParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Locality Sensitive Hashing (LSH) algorithm parameters.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">numHashTables</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;numHashTables&quot;</span><span class="p">,</span>
<span class="s2">&quot;number of hash tables, where &quot;</span>
<span class="o">+</span> <span class="s2">&quot;increasing number of hash tables lowers the false negative rate, &quot;</span>
<span class="o">+</span> <span class="s2">&quot;and decreasing it improves the running performance.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_LSHParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numHashTables</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumHashTables</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numHashTables or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numHashTables</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">_LSH</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">JM</span><span class="p">],</span> <span class="n">_LSHParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">,</span> <span class="n">Generic</span><span class="p">[</span><span class="n">JM</span><span class="p">]):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Locality Sensitive Hashing (LSH).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">setNumHashTables</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numHashTables`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numHashTables</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">_LSHModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_LSHParams</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Locality Sensitive Hashing (LSH) models.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">approxNearestNeighbors</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">key</span><span class="p">:</span> <span class="n">Vector</span><span class="p">,</span>
<span class="n">numNearestNeighbors</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">distCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;distCol&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Given a large dataset and an item, approximately find at most `numNearestNeighbors` items which have the</span>
<span class="sd"> closest distance to the item. If the :py:attr:`outputCol` is missing, the method will</span>
<span class="sd"> transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows</span>
<span class="sd"> caching of the transformed data when necessary.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method is experimental and will likely change behavior in the next release.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dataset : :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> The dataset to search for nearest neighbors of the key.</span>
<span class="sd"> key : :py:class:`pyspark.ml.linalg.Vector`</span>
<span class="sd"> Feature vector representing the item to search for.</span>
<span class="sd"> numNearestNeighbors : int</span>
<span class="sd"> The maximum number of nearest neighbors.</span>
<span class="sd"> distCol : str, optional</span>
<span class="sd"> Output column for storing the distance between each result row and the key.</span>
<span class="sd"> Use &quot;distCol&quot; as default value if it&#39;s not specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> A dataset containing at most `numNearestNeighbors` items closest to the key. A column &quot;distCol&quot; is</span>
<span class="sd"> added to show the distance between each row and the key.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;approxNearestNeighbors&quot;</span><span class="p">,</span> <span class="n">dataset</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">numNearestNeighbors</span><span class="p">,</span> <span class="n">distCol</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">approxSimilarityJoin</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">datasetA</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">datasetB</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">distCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;distCol&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Join two datasets to approximately find all pairs of rows whose distance is smaller than</span>
<span class="sd"> the threshold. If the :py:attr:`outputCol` is missing, the method will transform the data;</span>
<span class="sd"> if the :py:attr:`outputCol` exists, it will use that. This allows caching of the</span>
<span class="sd"> transformed data when necessary.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> datasetA : :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> One of the datasets to join.</span>
<span class="sd"> datasetB : :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> Another dataset to join.</span>
<span class="sd"> threshold : float</span>
<span class="sd"> The threshold for the distance of row pairs.</span>
<span class="sd"> distCol : str, optional</span>
<span class="sd"> Output column for storing the distance between each pair of rows. Use</span>
<span class="sd"> &quot;distCol&quot; as default value if it&#39;s not specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> A joined dataset containing pairs of rows. The original rows are in columns</span>
<span class="sd"> &quot;datasetA&quot; and &quot;datasetB&quot;, and a column &quot;distCol&quot; is added to show the distance</span>
<span class="sd"> between each pair.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">threshold</span> <span class="o">=</span> <span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">(</span><span class="n">threshold</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;approxSimilarityJoin&quot;</span><span class="p">,</span> <span class="n">datasetA</span><span class="p">,</span> <span class="n">datasetB</span><span class="p">,</span> <span class="n">threshold</span><span class="p">,</span> <span class="n">distCol</span><span class="p">)</span>
<!-- Highlighted listing of the private class _BucketedRandomProjectionLSHParams: it declares the bucketLength Param and the getBucketLength getter. This run of spans has no viewcode-block wrapper and no [docs] back-link. --><span class="k">class</span> <span class="nc">_BucketedRandomProjectionLSHParams</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`BucketedRandomProjectionLSH` and</span>
<span class="sd"> :py:class:`BucketedRandomProjectionLSHModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">bucketLength</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;bucketLength&quot;</span><span class="p">,</span>
<span class="s2">&quot;the length of each hash bucket, &quot;</span> <span class="o">+</span> <span class="s2">&quot;a larger bucket lowers the false negative rate.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getBucketLength</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of bucketLength or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Params</span><span class="p">,</span> <span class="bp">self</span><span class="p">))</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">bucketLength</span><span class="p">)</span>
<!-- Source listing block for BucketedRandomProjectionLSH, with nested blocks for setParams, setBucketLength and setSeed. Every [docs] back-link carries an aria-label that starts with the visible word "docs" and names its target, so the otherwise identical links can be told apart by assistive technology. Comments are kept inline because this markup is whitespace sensitive. --><div class="viewcode-block" id="BucketedRandomProjectionLSH"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH" aria-label="docs: BucketedRandomProjectionLSH">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">BucketedRandomProjectionLSH</span><span class="p">(</span>
<span class="n">_LSH</span><span class="p">[</span><span class="s2">&quot;BucketedRandomProjectionLSHModel&quot;</span><span class="p">],</span>
<span class="n">_LSHParams</span><span class="p">,</span>
<span class="n">_BucketedRandomProjectionLSHParams</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;BucketedRandomProjectionLSH&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> LSH class for Euclidean distance metrics.</span>
<span class="sd"> The input is dense or sparse vectors, each of which represents a point in the Euclidean</span>
<span class="sd"> distance space. The output will be vectors of configurable dimension. Hash values in the same</span>
<span class="sd"> dimension are calculated by the same hash function.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - `Stable Distributions in Wikipedia article on Locality-sensitive hashing \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions&gt;`_</span>
<span class="sd"> - `Hashing for Similarity Search: A Survey &lt;https://arxiv.org/abs/1408.2927&gt;`_</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col</span>
<span class="sd"> &gt;&gt;&gt; data = [(0, Vectors.dense([-1.0, -1.0 ]),),</span>
<span class="sd"> ... (1, Vectors.dense([-1.0, 1.0 ]),),</span>
<span class="sd"> ... (2, Vectors.dense([1.0, -1.0 ]),),</span>
<span class="sd"> ... (3, Vectors.dense([1.0, 1.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; brp = BucketedRandomProjectionLSH()</span>
<span class="sd"> &gt;&gt;&gt; brp.setInputCol(&quot;features&quot;)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; brp.setOutputCol(&quot;hashes&quot;)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; brp.setSeed(12345)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; brp.setBucketLength(1.0)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; model = brp.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getBucketLength()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;hashes&quot;)</span>
<span class="sd"> BucketedRandomProjectionLSHModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head()</span>
<span class="sd"> Row(id=0, features=DenseVector([-1.0, -1.0]), hashes=[DenseVector([-1.0])])</span>
<span class="sd"> &gt;&gt;&gt; data2 = [(4, Vectors.dense([2.0, 2.0 ]),),</span>
<span class="sd"> ... (5, Vectors.dense([2.0, 3.0 ]),),</span>
<span class="sd"> ... (6, Vectors.dense([3.0, 2.0 ]),),</span>
<span class="sd"> ... (7, Vectors.dense([3.0, 3.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(data2, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; model.approxNearestNeighbors(df2, Vectors.dense([1.0, 2.0]), 1).collect()</span>
<span class="sd"> [Row(id=4, features=DenseVector([2.0, 2.0]), hashes=[DenseVector([1.0])], distCol=1.0)]</span>
<span class="sd"> &gt;&gt;&gt; model.approxSimilarityJoin(df, df2, 3.0, distCol=&quot;EuclideanDistance&quot;).select(</span>
<span class="sd"> ... col(&quot;datasetA.id&quot;).alias(&quot;idA&quot;),</span>
<span class="sd"> ... col(&quot;datasetB.id&quot;).alias(&quot;idB&quot;),</span>
<span class="sd"> ... col(&quot;EuclideanDistance&quot;)).show()</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> |idA|idB|EuclideanDistance|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> | 3| 6| 2.23606797749979|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.approxSimilarityJoin(df, df2, 3, distCol=&quot;EuclideanDistance&quot;).select(</span>
<span class="sd"> ... col(&quot;datasetA.id&quot;).alias(&quot;idA&quot;),</span>
<span class="sd"> ... col(&quot;datasetB.id&quot;).alias(&quot;idB&quot;),</span>
<span class="sd"> ... col(&quot;EuclideanDistance&quot;)).show()</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> |idA|idB|EuclideanDistance|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> | 3| 6| 2.23606797749979|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; brpPath = temp_path + &quot;/brp&quot;</span>
<span class="sd"> &gt;&gt;&gt; brp.save(brpPath)</span>
<span class="sd"> &gt;&gt;&gt; brp2 = BucketedRandomProjectionLSH.load(brpPath)</span>
<span class="sd"> &gt;&gt;&gt; brp2.getBucketLength() == brp.getBucketLength()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/brp-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model2 = BucketedRandomProjectionLSHModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().hashes == model2.transform(df).head().hashes</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">bucketLength</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \</span>
<span class="sd"> bucketLength=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">BucketedRandomProjectionLSH</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.BucketedRandomProjectionLSH&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="BucketedRandomProjectionLSH.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setParams" aria-label="docs: BucketedRandomProjectionLSH.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">bucketLength</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BucketedRandomProjectionLSH&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \</span>
<span class="sd"> bucketLength=None)</span>
<span class="sd"> Sets params for this BucketedRandomProjectionLSH.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="BucketedRandomProjectionLSH.setBucketLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setBucketLength" aria-label="docs: BucketedRandomProjectionLSH.setBucketLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBucketLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BucketedRandomProjectionLSH&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`bucketLength`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">bucketLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BucketedRandomProjectionLSH.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setSeed" aria-label="docs: BucketedRandomProjectionLSH.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BucketedRandomProjectionLSH&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BucketedRandomProjectionLSHModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">BucketedRandomProjectionLSHModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- Source listing block for BucketedRandomProjectionLSHModel. The [docs] back-link carries an aria-label that starts with the visible word "docs" and names its target, so it is distinguishable from the other identical [docs] links for assistive technology. The comment is kept inline because this markup is whitespace sensitive. --><div class="viewcode-block" id="BucketedRandomProjectionLSHModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSHModel.html#pyspark.ml.feature.BucketedRandomProjectionLSHModel" aria-label="docs: BucketedRandomProjectionLSHModel">[docs]</a><span class="k">class</span> <span class="nc">BucketedRandomProjectionLSHModel</span><span class="p">(</span>
<span class="n">_LSHModel</span><span class="p">,</span>
<span class="n">_BucketedRandomProjectionLSHParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;BucketedRandomProjectionLSHModel&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are</span>
<span class="sd"> stored. The vectors are normalized to be unit vectors and each vector is used in a hash</span>
<span class="sd"> function: :math:`h_i(x) = floor(r_i \cdot x / bucketLength)` where :math:`r_i` is the</span>
<span class="sd"> i-th random unit vector. The number of buckets will be `(max L2 norm of input vectors) /</span>
<span class="sd"> bucketLength`.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="Bucketizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Bucketizer</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Bucketizer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Maps a column of continuous features to a column of feature buckets. Since 3.0.0,</span>
<span class="sd"> :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`</span>
<span class="sd"> parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters</span>
<span class="sd"> are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single</span>
<span class="sd"> column usage, and :py:attr:`splitsArray` is for multiple columns.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float(&quot;nan&quot;)),</span>
<span class="sd"> ... (float(&quot;nan&quot;), 1.0), (float(&quot;nan&quot;), 0.0)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(values, [&quot;values1&quot;, &quot;values2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; bucketizer = Bucketizer()</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setSplits([-float(&quot;inf&quot;), 0.5, 1.4, float(&quot;inf&quot;)])</span>
<span class="sd"> Bucketizer...</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setInputCol(&quot;values1&quot;)</span>
<span class="sd"> Bucketizer...</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setOutputCol(&quot;buckets&quot;)</span>
<span class="sd"> Bucketizer...</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.setHandleInvalid(&quot;keep&quot;).transform(df).collect()</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.setHandleInvalid(&quot;keep&quot;).transform(df.select(&quot;values1&quot;))</span>
<span class="sd"> &gt;&gt;&gt; bucketed.show(truncate=False)</span>
<span class="sd"> +-------+-------+</span>
<span class="sd"> |values1|buckets|</span>
<span class="sd"> +-------+-------+</span>
<span class="sd"> |0.1 |0.0 |</span>
<span class="sd"> |0.4 |0.0 |</span>
<span class="sd"> |1.2 |1.0 |</span>
<span class="sd"> |1.5 |2.0 |</span>
<span class="sd"> |NaN |3.0 |</span>
<span class="sd"> |NaN |3.0 |</span>
<span class="sd"> +-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setParams(outputCol=&quot;b&quot;).transform(df).head().b</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; bucketizerPath = temp_path + &quot;/bucketizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.save(bucketizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBucketizer = Bucketizer.load(bucketizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBucketizer.getSplits() == bucketizer.getSplits()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedBucketizer.transform(df).take(1) == bucketizer.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.setHandleInvalid(&quot;skip&quot;).transform(df).collect()</span>
<span class="sd"> &gt;&gt;&gt; len(bucketed)</span>
<span class="sd"> 4</span>
<span class="sd"> &gt;&gt;&gt; bucketizer2 = Bucketizer(splitsArray=</span>
<span class="sd"> ... [[-float(&quot;inf&quot;), 0.5, 1.4, float(&quot;inf&quot;)], [-float(&quot;inf&quot;), 0.5, float(&quot;inf&quot;)]],</span>
<span class="sd"> ... inputCols=[&quot;values1&quot;, &quot;values2&quot;], outputCols=[&quot;buckets1&quot;, &quot;buckets2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; bucketed2 = bucketizer2.setHandleInvalid(&quot;keep&quot;).transform(df)</span>
<span class="sd"> &gt;&gt;&gt; bucketed2.show(truncate=False)</span>
<span class="sd"> +-------+-------+--------+--------+</span>
<span class="sd"> |values1|values2|buckets1|buckets2|</span>
<span class="sd"> +-------+-------+--------+--------+</span>
<span class="sd"> |0.1 |0.0 |0.0 |0.0 |</span>
<span class="sd"> |0.4 |1.0 |0.0 |1.0 |</span>
<span class="sd"> |1.2 |1.3 |1.0 |1.0 |</span>
<span class="sd"> |1.5 |NaN |2.0 |2.0 |</span>
<span class="sd"> |NaN |1.0 |3.0 |1.0 |</span>
<span class="sd"> |NaN |0.0 |3.0 |0.0 |</span>
<span class="sd"> +-------+-------+--------+--------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">splits</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;splits&quot;</span><span class="p">,</span>
<span class="s2">&quot;Split points for mapping continuous features into buckets. With n+1 splits, &quot;</span>
<span class="o">+</span> <span class="s2">&quot;there are n buckets. A bucket defined by splits x,y holds values in the &quot;</span>
<span class="o">+</span> <span class="s2">&quot;range [x,y) except the last bucket, which also includes y. The splits &quot;</span>
<span class="o">+</span> <span class="s2">&quot;should be of length &gt;= 3 and strictly increasing. Values at -inf, inf must be &quot;</span>
<span class="o">+</span> <span class="s2">&quot;explicitly provided to cover all Double values; otherwise, values outside the &quot;</span>
<span class="o">+</span> <span class="s2">&quot;splits specified will be treated as errors.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;how to handle invalid entries &quot;</span>
<span class="s2">&quot;containing NaN values. Values outside the splits will always be treated &quot;</span>
<span class="s2">&quot;as errors. Options are &#39;skip&#39; (filter out rows with invalid values), &quot;</span>
<span class="o">+</span> <span class="s2">&quot;&#39;error&#39; (throw an error), or &#39;keep&#39; (keep invalid values in a &quot;</span>
<span class="o">+</span> <span class="s2">&quot;special additional bucket). Note that in the multiple column &quot;</span>
<span class="o">+</span> <span class="s2">&quot;case, the invalid handling is applied to all columns. That said &quot;</span>
<span class="o">+</span> <span class="s2">&quot;for &#39;error&#39; it will throw an error if any invalids are found in &quot;</span>
<span class="o">+</span> <span class="s2">&quot;any column, for &#39;skip&#39; it will skip rows with any invalids in &quot;</span>
<span class="o">+</span> <span class="s2">&quot;any columns, etc.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">splitsArray</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;splitsArray&quot;</span><span class="p">,</span>
<span class="s2">&quot;The array of split points for mapping &quot;</span>
<span class="o">+</span> <span class="s2">&quot;continuous features into buckets for multiple columns. For each input &quot;</span>
<span class="o">+</span> <span class="s2">&quot;column, with n+1 splits, there are n buckets. A bucket defined by &quot;</span>
<span class="o">+</span> <span class="s2">&quot;splits x,y holds values in the range [x,y) except the last bucket, &quot;</span>
<span class="o">+</span> <span class="s2">&quot;which also includes y. The splits should be of length &gt;= 3 and &quot;</span>
<span class="o">+</span> <span class="s2">&quot;strictly increasing. Values at -inf, inf must be explicitly provided &quot;</span>
<span class="o">+</span> <span class="s2">&quot;to cover all Double values; otherwise, values outside the splits &quot;</span>
<span class="o">+</span> <span class="s2">&quot;specified will be treated as errors.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListListFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;, \</span>
<span class="sd"> splitsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Bucketizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Bucketizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="Bucketizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;, \</span>
<span class="sd"> splitsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> Sets params for this Bucketizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setSplits"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setSplits">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`splits`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">splits</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.getSplits"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.getSplits">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of splits or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">splits</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setSplitsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setSplitsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSplitsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`splitsArray`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">splitsArray</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.getSplitsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.getSplitsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSplitsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the array of split points or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">splitsArray</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Bucketizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_CountVectorizerParams</span><span class="p">(</span><span class="n">JavaParams</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`CountVectorizer` and :py:class:`CountVectorizerModel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">minTF</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;minTF&quot;</span><span class="p">,</span>
<span class="s2">&quot;Filter to ignore rare words in&quot;</span>
<span class="o">+</span> <span class="s2">&quot; a document. For each document, terms with frequency/count less than the given&quot;</span>
<span class="o">+</span> <span class="s2">&quot; threshold are ignored. If this is an integer &gt;= 1, then this specifies a count (of&quot;</span>
<span class="o">+</span> <span class="s2">&quot; times the term must appear in the document); if this is a double in [0,1), then this &quot;</span>
<span class="o">+</span> <span class="s2">&quot;specifies a fraction (out of the document&#39;s token count). Note that the parameter is &quot;</span>
<span class="o">+</span> <span class="s2">&quot;only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">minDF</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;minDF&quot;</span><span class="p">,</span>
<span class="s2">&quot;Specifies the minimum number of&quot;</span>
<span class="o">+</span> <span class="s2">&quot; different documents a term must appear in to be included in the vocabulary.&quot;</span>
<span class="o">+</span> <span class="s2">&quot; If this is an integer &gt;= 1, this specifies the number of documents the term must&quot;</span>
<span class="o">+</span> <span class="s2">&quot; appear in; if this is a double in [0,1), then this specifies the fraction of documents.&quot;</span>
<span class="o">+</span> <span class="s2">&quot; Default 1.0&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">maxDF</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;maxDF&quot;</span><span class="p">,</span>
<span class="s2">&quot;Specifies the maximum number of&quot;</span>
<span class="o">+</span> <span class="s2">&quot; different documents a term could appear in to be included in the vocabulary.&quot;</span>
<span class="o">+</span> <span class="s2">&quot; A term that appears more than the threshold will be ignored. If this is an&quot;</span>
<span class="o">+</span> <span class="s2">&quot; integer &gt;= 1, this specifies the maximum number of documents the term could appear in;&quot;</span>
<span class="o">+</span> <span class="s2">&quot; if this is a double in [0,1), then this specifies the maximum&quot;</span>
<span class="o">+</span> <span class="s2">&quot; fraction of documents the term could appear in.&quot;</span>
<span class="o">+</span> <span class="s2">&quot; Default (2^63) - 1&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">vocabSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;vocabSize&quot;</span><span class="p">,</span>
<span class="s2">&quot;max size of the vocabulary. Default 1 &lt;&lt; 18.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">binary</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;binary&quot;</span><span class="p">,</span>
<span class="s2">&quot;Binary toggle to control the output vector values.&quot;</span>
<span class="o">+</span> <span class="s2">&quot; If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful&quot;</span>
<span class="o">+</span> <span class="s2">&quot; for discrete probabilistic models that model binary events rather than integer counts.&quot;</span>
<span class="o">+</span> <span class="s2">&quot; Default False&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_CountVectorizerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">maxDF</span><span class="o">=</span><span class="mi">2</span><span class="o">**</span><span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minTF or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minTF</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minDF or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDF</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMaxDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of maxDF or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxDF</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of vocabSize or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocabSize</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of binary or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">binary</span><span class="p">)</span>
<div class="viewcode-block" id="CountVectorizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">CountVectorizer</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">],</span>
<span class="n">_CountVectorizerParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;CountVectorizer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts a vocabulary from document collections and generates a :py:class:`CountVectorizerModel`.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(0, [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]), (1, [&quot;a&quot;, &quot;b&quot;, &quot;b&quot;, &quot;c&quot;, &quot;a&quot;])],</span>
<span class="sd"> ... [&quot;label&quot;, &quot;raw&quot;])</span>
<span class="sd"> &gt;&gt;&gt; cv = CountVectorizer()</span>
<span class="sd"> &gt;&gt;&gt; cv.setInputCol(&quot;raw&quot;)</span>
<span class="sd"> CountVectorizer...</span>
<span class="sd"> &gt;&gt;&gt; cv.setOutputCol(&quot;vectors&quot;)</span>
<span class="sd"> CountVectorizer...</span>
<span class="sd"> &gt;&gt;&gt; model = cv.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCol(&quot;raw&quot;)</span>
<span class="sd"> CountVectorizerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show(truncate=False)</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |label|raw |vectors |</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|</span>
<span class="sd"> |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; sorted(model.vocabulary) == [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;]</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; countVectorizerPath = temp_path + &quot;/count-vectorizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; cv.save(countVectorizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedCv = CountVectorizer.load(countVectorizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedCv.getMinDF() == cv.getMinDF()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedCv.getMinTF() == cv.getMinTF()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedCv.getVocabSize() == cv.getVocabSize()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/count-vectorizer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = CountVectorizerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.vocabulary == model.vocabulary</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; fromVocabModel = CountVectorizerModel.from_vocabulary([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],</span>
<span class="sd"> ... inputCol=&quot;raw&quot;, outputCol=&quot;vectors&quot;)</span>
<span class="sd"> &gt;&gt;&gt; fromVocabModel.transform(df).show(truncate=False)</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |label|raw |vectors |</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|</span>
<span class="sd"> |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">minTF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">minDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">maxDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mi">2</span><span class="o">**</span><span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">vocabSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 &lt;&lt; 18,\</span>
<span class="sd"> binary=False, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">CountVectorizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.CountVectorizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="CountVectorizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">minTF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">minDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">maxDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mi">2</span><span class="o">**</span><span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">vocabSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 &lt;&lt; 18,\</span>
<span class="sd"> binary=False, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets the params for the CountVectorizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setMinTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMinTF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minTF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setMinDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMinDF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minDF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setMaxDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMaxDF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxDF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxDF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setVocabSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setVocabSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setVocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`vocabSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vocabSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`binary`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">CountVectorizerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">CountVectorizerModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span> <span class="n">_CountVectorizerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`CountVectorizer`.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="CountVectorizerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizerModel.from_vocabulary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.from_vocabulary">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">from_vocabulary</span><span class="p">(</span>
<span class="bp">cls</span><span class="p">,</span>
<span class="n">vocabulary</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">minTF</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">binary</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct the model directly from a vocabulary list of strings.</span>
<span class="sd"> Requires an active SparkContext.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span>
<span class="n">jvocab</span> <span class="o">=</span> <span class="n">CountVectorizerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">vocabulary</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">CountVectorizerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.CountVectorizerModel&quot;</span><span class="p">,</span> <span class="n">jvocab</span>
<span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">setInputCol</span><span class="p">(</span><span class="n">inputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">outputCol</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setOutputCol</span><span class="p">(</span><span class="n">outputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">minTF</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setMinTF</span><span class="p">(</span><span class="n">minTF</span><span class="p">)</span>
<span class="k">if</span> <span class="n">binary</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setBinary</span><span class="p">(</span><span class="n">binary</span><span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vocabSize</span><span class="o">=</span><span class="nb">len</span><span class="p">(</span><span class="n">vocabulary</span><span class="p">))</span>
<span class="k">return</span> <span class="n">model</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">vocabulary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> An array of terms in the vocabulary.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;vocabulary&quot;</span><span class="p">)</span>
<div class="viewcode-block" id="CountVectorizerModel.setMinTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setMinTF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minTF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizerModel.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;CountVectorizerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`binary`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="DCT"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">DCT</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;DCT&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that takes the 1D discrete cosine transform</span>
<span class="sd"> of a real vector. No zero padding is performed on the input vector.</span>
<span class="sd"> It returns a real vector of the same length representing the DCT.</span>
<span class="sd"> The return vector is scaled such that the transform matrix is</span>
<span class="sd"> unitary (aka scaled DCT-II).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> `More information on Wikipedia \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II&gt;`_.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], [&quot;vec&quot;])</span>
<span class="sd"> &gt;&gt;&gt; dct = DCT( )</span>
<span class="sd"> &gt;&gt;&gt; dct.setInverse(False)</span>
<span class="sd"> DCT...</span>
<span class="sd"> &gt;&gt;&gt; dct.setInputCol(&quot;vec&quot;)</span>
<span class="sd"> DCT...</span>
<span class="sd"> &gt;&gt;&gt; dct.setOutputCol(&quot;resultVec&quot;)</span>
<span class="sd"> DCT...</span>
<span class="sd"> &gt;&gt;&gt; df2 = dct.transform(df1)</span>
<span class="sd"> &gt;&gt;&gt; df2.head().resultVec</span>
<span class="sd"> DenseVector([10.969..., -0.707..., -2.041...])</span>
<span class="sd"> &gt;&gt;&gt; df3 = DCT(inverse=True, inputCol=&quot;resultVec&quot;, outputCol=&quot;origVec&quot;).transform(df2)</span>
<span class="sd"> &gt;&gt;&gt; df3.head().origVec</span>
<span class="sd"> DenseVector([5.0, 8.0, 6.0])</span>
<span class="sd"> &gt;&gt;&gt; dctPath = temp_path + &quot;/dct&quot;</span>
<span class="sd"> &gt;&gt;&gt; dct.save(dctPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedDct = DCT.load(dctPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedDct.transform(df1).take(1) == dct.transform(df1).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedDct.getInverse()</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">inverse</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;inverse&quot;</span><span class="p">,</span>
<span class="s2">&quot;Set transformer to perform inverse DCT, &quot;</span> <span class="o">+</span> <span class="s2">&quot;default False.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inverse</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inverse=False, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">DCT</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.DCT&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="DCT.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inverse</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DCT&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inverse=False, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this DCT.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.setInverse"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setInverse">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInverse</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DCT&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inverse`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.getInverse"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.getInverse">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getInverse</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of inverse or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">inverse</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DCT&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DCT&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="ElementwiseProduct"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">ElementwiseProduct</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;ElementwiseProduct&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Outputs the Hadamard product (i.e., the element-wise product) of each input vector</span>
<span class="sd"> with a provided &quot;weight&quot; vector. In other words, it scales each column of the dataset</span>
<span class="sd"> by a scalar multiplier.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; ep = ElementwiseProduct()</span>
<span class="sd"> &gt;&gt;&gt; ep.setScalingVec(Vectors.dense([1.0, 2.0, 3.0]))</span>
<span class="sd"> ElementwiseProduct...</span>
<span class="sd"> &gt;&gt;&gt; ep.setInputCol(&quot;values&quot;)</span>
<span class="sd"> ElementwiseProduct...</span>
<span class="sd"> &gt;&gt;&gt; ep.setOutputCol(&quot;eprod&quot;)</span>
<span class="sd"> ElementwiseProduct...</span>
<span class="sd"> &gt;&gt;&gt; ep.transform(df).head().eprod</span>
<span class="sd"> DenseVector([2.0, 2.0, 9.0])</span>
<span class="sd"> &gt;&gt;&gt; ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod</span>
<span class="sd"> DenseVector([4.0, 3.0, 15.0])</span>
<span class="sd"> &gt;&gt;&gt; elementwiseProductPath = temp_path + &quot;/elementwise-product&quot;</span>
<span class="sd"> &gt;&gt;&gt; ep.save(elementwiseProductPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedEp = ElementwiseProduct.load(elementwiseProductPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedEp.getScalingVec() == ep.getScalingVec()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedEp.transform(df).take(1) == ep.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">scalingVec</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">Vector</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;scalingVec&quot;</span><span class="p">,</span>
<span class="s2">&quot;Vector for hadamard product.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toVector</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">scalingVec</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Vector</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, scalingVec=None, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">ElementwiseProduct</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.ElementwiseProduct&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="ElementwiseProduct.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">scalingVec</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Vector</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ElementwiseProduct&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, scalingVec=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this ElementwiseProduct.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.setScalingVec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setScalingVec">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ElementwiseProduct&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`scalingVec`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">scalingVec</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.getScalingVec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.getScalingVec">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of scalingVec or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">scalingVec</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ElementwiseProduct&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ElementwiseProduct&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="FeatureHasher"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">FeatureHasher</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasNumFeatures</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;FeatureHasher&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Feature hashing projects a set of categorical or numerical features into a feature vector of</span>
<span class="sd"> specified dimension (typically substantially smaller than that of the original feature</span>
<span class="sd"> space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)</span>
<span class="sd"> to map features to indices in the feature vector.</span>
<span class="sd"> The FeatureHasher transformer operates on multiple columns. Each column may contain either</span>
<span class="sd"> numeric or categorical features. Behavior and handling of column data types are as follows:</span>
<span class="sd"> * Numeric columns:</span>
<span class="sd"> For numeric features, the hash value of the column name is used to map the</span>
<span class="sd"> feature value to its index in the feature vector. By default, numeric features</span>
<span class="sd"> are not treated as categorical (even when they are integers). To treat them</span>
<span class="sd"> as categorical, specify the relevant columns in `categoricalCols`.</span>
<span class="sd"> * String columns:</span>
<span class="sd"> For categorical features, the hash value of the string &quot;column_name=value&quot;</span>
<span class="sd"> is used to map to the vector index, with an indicator value of `1.0`.</span>
<span class="sd"> Thus, categorical features are &quot;one-hot&quot; encoded</span>
<span class="sd"> (similarly to using :py:class:`OneHotEncoder` with `dropLast=False`).</span>
<span class="sd"> * Boolean columns:</span>
<span class="sd"> Boolean values are treated in the same way as string columns. That is,</span>
<span class="sd"> boolean features are represented as &quot;column_name=true&quot; or &quot;column_name=false&quot;,</span>
<span class="sd"> with an indicator value of `1.0`.</span>
<span class="sd"> Null (missing) values are ignored (implicitly zero in the resulting feature vector).</span>
<span class="sd"> Since a simple modulo is used to transform the hash value to a vector index,</span>
<span class="sd"> it is advisable to use a power of two as the `numFeatures` parameter;</span>
<span class="sd"> otherwise the features will not be mapped evenly to the vector indices.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(2.0, True, &quot;1&quot;, &quot;foo&quot;), (3.0, False, &quot;2&quot;, &quot;bar&quot;)]</span>
<span class="sd"> &gt;&gt;&gt; cols = [&quot;real&quot;, &quot;bool&quot;, &quot;stringNum&quot;, &quot;string&quot;]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, cols)</span>
<span class="sd"> &gt;&gt;&gt; hasher = FeatureHasher()</span>
<span class="sd"> &gt;&gt;&gt; hasher.setInputCols(cols)</span>
<span class="sd"> FeatureHasher...</span>
<span class="sd"> &gt;&gt;&gt; hasher.setOutputCol(&quot;features&quot;)</span>
<span class="sd"> FeatureHasher...</span>
<span class="sd"> &gt;&gt;&gt; hasher.transform(df).head().features</span>
<span class="sd"> SparseVector(262144, {174475: 2.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hasher.setCategoricalCols([&quot;real&quot;]).transform(df).head().features</span>
<span class="sd"> SparseVector(262144, {171257: 1.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hasherPath = temp_path + &quot;/hasher&quot;</span>
<span class="sd"> &gt;&gt;&gt; hasher.save(hasherPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHasher = FeatureHasher.load(hasherPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHasher.getNumFeatures() == hasher.getNumFeatures()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedHasher.transform(df).head().features == hasher.transform(df).head().features</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">categoricalCols</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;categoricalCols&quot;</span><span class="p">,</span>
<span class="s2">&quot;numeric columns to treat as categorical&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">categoricalCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numFeatures=1 &lt;&lt; 18, inputCols=None, outputCol=None, \</span>
<span class="sd"> categoricalCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">FeatureHasher</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.FeatureHasher&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="FeatureHasher.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">categoricalCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;FeatureHasher&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numFeatures=1 &lt;&lt; 18, inputCols=None, outputCol=None, \</span>
<span class="sd"> categoricalCols=None)</span>
<span class="sd"> Sets params for this FeatureHasher.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setCategoricalCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setCategoricalCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setCategoricalCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;FeatureHasher&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`categoricalCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">categoricalCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.getCategoricalCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.getCategoricalCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getCategoricalCols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of categoricalCols or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">categoricalCols</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setInputCols">[docs]</a> <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;FeatureHasher&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;FeatureHasher&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setNumFeatures"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setNumFeatures">[docs]</a> <span class="k">def</span> <span class="nf">setNumFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;FeatureHasher&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numFeatures`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="HashingTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">HashingTF</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasNumFeatures</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;HashingTF&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Maps a sequence of terms to their term frequencies using the hashing trick.</span>
<span class="sd"> Currently we use Austin Appleby&#39;s MurmurHash 3 algorithm (MurmurHash3_x86_32)</span>
<span class="sd"> to calculate the hash code value for the term object.</span>
<span class="sd"> Since a simple modulo is used to transform the hash function to a column index,</span>
<span class="sd"> it is advisable to use a power of two as the numFeatures parameter;</span>
<span class="sd"> otherwise the features will not be mapped evenly to the columns.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],)], [&quot;words&quot;])</span>
<span class="sd"> &gt;&gt;&gt; hashingTF = HashingTF(inputCol=&quot;words&quot;, outputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.setNumFeatures(10)</span>
<span class="sd"> HashingTF...</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.transform(df).head().features</span>
<span class="sd"> SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.setParams(outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd"> SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; params = {hashingTF.numFeatures: 5, hashingTF.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.transform(df, params).head().vector</span>
<span class="sd"> SparseVector(5, {0: 1.0, 2: 1.0, 3: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hashingTFPath = temp_path + &quot;/hashing-tf&quot;</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.save(hashingTFPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHashingTF = HashingTF.load(hashingTFPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedHashingTF.transform(df).take(1) == hashingTF.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.indexOf(&quot;b&quot;)</span>
<span class="sd"> 5</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">binary</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;binary&quot;</span><span class="p">,</span>
<span class="s2">&quot;If True, all non zero counts are set to 1. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;This is useful for discrete probabilistic models that model binary events &quot;</span>
<span class="o">+</span> <span class="s2">&quot;rather than integer counts. Default False.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numFeatures=1 &lt;&lt; 18, binary=False, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">HashingTF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.HashingTF&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="HashingTF.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;HashingTF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numFeatures=1 &lt;&lt; 18, binary=False, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this HashingTF.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;HashingTF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`binary`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.getBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.getBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of binary or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">binary</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;HashingTF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;HashingTF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setNumFeatures"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setNumFeatures">[docs]</a> <span class="k">def</span> <span class="nf">setNumFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;HashingTF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numFeatures`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.indexOf"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.indexOf">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">indexOf</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">term</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the index of the input term.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_transfer_params_to_java</span><span class="p">()</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">indexOf</span><span class="p">(</span><span class="n">term</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_IDFParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`IDF` and :py:class:`IDFModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">minDocFreq</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;minDocFreq&quot;</span><span class="p">,</span>
<span class="s2">&quot;minimum number of documents in which a term should appear for filtering&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minDocFreq or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDocFreq</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_IDFParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<div class="viewcode-block" id="IDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">IDF</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;IDFModel&quot;</span><span class="p">],</span> <span class="n">_IDFParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;IDF&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute the Inverse Document Frequency (IDF) given a collection of documents.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import DenseVector</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(DenseVector([1.0, 2.0]),),</span>
<span class="sd"> ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], [&quot;tf&quot;])</span>
<span class="sd"> &gt;&gt;&gt; idf = IDF(minDocFreq=3)</span>
<span class="sd"> &gt;&gt;&gt; idf.setInputCol(&quot;tf&quot;)</span>
<span class="sd"> IDF...</span>
<span class="sd"> &gt;&gt;&gt; idf.setOutputCol(&quot;idf&quot;)</span>
<span class="sd"> IDF...</span>
<span class="sd"> &gt;&gt;&gt; model = idf.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;idf&quot;)</span>
<span class="sd"> IDFModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getMinDocFreq()</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; model.idf</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.docFreq</span>
<span class="sd"> [0, 3]</span>
<span class="sd"> &gt;&gt;&gt; model.numDocs == df.count()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().idf</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; idf.setParams(outputCol=&quot;freqs&quot;).fit(df).transform(df).collect()[1].freqs</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; params = {idf.minDocFreq: 1, idf.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; idf.fit(df, params).transform(df).head().vector</span>
<span class="sd"> DenseVector([0.2877, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; idfPath = temp_path + &quot;/idf&quot;</span>
<span class="sd"> &gt;&gt;&gt; idf.save(idfPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIdf = IDF.load(idfPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIdf.getMinDocFreq() == idf.getMinDocFreq()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/idf-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = IDFModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).head().idf == model.transform(df).head().idf</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">minDocFreq</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">IDF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.IDF&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="IDF.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">minDocFreq</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IDF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this IDF.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDF.setMinDocFreq"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setMinDocFreq">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IDF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minDocFreq`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDF.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IDF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDF.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IDF&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IDFModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">IDFModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDFModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel">[docs]</a><span class="k">class</span> <span class="nc">IDFModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_IDFParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;IDFModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`IDF`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="IDFModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IDFModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDFModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IDFModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">idf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the IDF vector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;idf&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">docFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the document frequency.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;docFreq&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">numDocs</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of documents evaluated to compute the IDF.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;numDocs&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_ImputerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`Imputer` and :py:class:`ImputerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">strategy</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;strategy&quot;</span><span class="p">,</span>
<span class="s2">&quot;strategy for imputation. If mean, then replace missing values using the mean &quot;</span>
<span class="s2">&quot;value of the feature. If median, then replace missing values using the &quot;</span>
<span class="s2">&quot;median value of the feature. If mode, then replace missing values using the most &quot;</span>
<span class="s2">&quot;frequent value of the feature.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">missingValue</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;missingValue&quot;</span><span class="p">,</span>
<span class="s2">&quot;The placeholder for the missing values. All occurrences of missingValue &quot;</span>
<span class="s2">&quot;will be imputed.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_ImputerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">strategy</span><span class="o">=</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="n">missingValue</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStrategy</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`strategy` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">strategy</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMissingValue</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`missingValue` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">missingValue</span><span class="p">)</span>
<div class="viewcode-block" id="Imputer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Imputer</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;ImputerModel&quot;</span><span class="p">],</span> <span class="n">_ImputerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Imputer&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Imputation estimator for completing missing values, using the mean, median or mode</span>
<span class="sd"> of the columns in which the missing values are located. The input columns should be of</span>
<span class="sd"> numeric type. Currently Imputer does not support categorical features and</span>
<span class="sd"> possibly creates incorrect values for a categorical feature.</span>
<span class="sd"> Note that the mean/median/mode value is computed after filtering out missing values.</span>
<span class="sd"> All Null values in the input columns are treated as missing, and so are also imputed. For</span>
<span class="sd"> computing median, :py:meth:`pyspark.sql.DataFrame.approxQuantile` is used with a</span>
<span class="sd"> relative error of `0.001`.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1.0, float(&quot;nan&quot;)), (2.0, float(&quot;nan&quot;)), (float(&quot;nan&quot;), 3.0),</span>
<span class="sd"> ... (4.0, 4.0), (5.0, 5.0)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; imputer = Imputer()</span>
<span class="sd"> &gt;&gt;&gt; imputer.setInputCols([&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> Imputer...</span>
<span class="sd"> &gt;&gt;&gt; imputer.setOutputCols([&quot;out_a&quot;, &quot;out_b&quot;])</span>
<span class="sd"> Imputer...</span>
<span class="sd"> &gt;&gt;&gt; imputer.getRelativeError()</span>
<span class="sd"> 0.001</span>
<span class="sd"> &gt;&gt;&gt; model = imputer.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCols([&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> ImputerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getStrategy()</span>
<span class="sd"> &#39;mean&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.surrogateDF.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> |3.0|4.0|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> | a| b|out_a|out_b|</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> |1.0|NaN| 1.0| 4.0|</span>
<span class="sd"> |2.0|NaN| 2.0| 4.0|</span>
<span class="sd"> |NaN|3.0| 3.0| 3.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputer.setStrategy(&quot;median&quot;).setMissingValue(1.0).fit(df).transform(df).show()</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> | a| b|out_a|out_b|</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> |1.0|NaN| 4.0| NaN|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1.0,), (2.0,), (float(&quot;nan&quot;),), (4.0,), (5.0,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; imputer1 = Imputer(inputCol=&quot;a&quot;, outputCol=&quot;out_a&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model1 = imputer1.fit(df1)</span>
<span class="sd"> &gt;&gt;&gt; model1.surrogateDF.show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | a|</span>
<span class="sd"> +---+</span>
<span class="sd"> |3.0|</span>
<span class="sd"> +---+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model1.transform(df1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | a|out_a|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |1.0| 1.0|</span>
<span class="sd"> |2.0| 2.0|</span>
<span class="sd"> |NaN| 3.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputer1.setStrategy(&quot;median&quot;).setMissingValue(1.0).fit(df1).transform(df1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | a|out_a|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |1.0| 4.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(float(&quot;nan&quot;),), (float(&quot;nan&quot;),), (3.0,), (4.0,), (5.0,)],</span>
<span class="sd"> ... [&quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; imputer2 = Imputer(inputCol=&quot;b&quot;, outputCol=&quot;out_b&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model2 = imputer2.fit(df2)</span>
<span class="sd"> &gt;&gt;&gt; model2.surrogateDF.show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | b|</span>
<span class="sd"> +---+</span>
<span class="sd"> |4.0|</span>
<span class="sd"> +---+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model2.transform(df2).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | b|out_b|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |NaN| 4.0|</span>
<span class="sd"> |NaN| 4.0|</span>
<span class="sd"> |3.0| 3.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputer2.setStrategy(&quot;median&quot;).setMissingValue(1.0).fit(df2).transform(df2).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | b|out_b|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |NaN| NaN|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputerPath = temp_path + &quot;/imputer&quot;</span>
<span class="sd"> &gt;&gt;&gt; imputer.save(imputerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedImputer = Imputer.load(imputerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedImputer.getStrategy() == imputer.getStrategy()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedImputer.getMissingValue()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/imputer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = ImputerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).head().out_a == model.transform(df).head().out_a</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;mean&quot;</span><span class="p">,</span>
<span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, strategy=&quot;mean&quot;, missingValue=float(&quot;nan&quot;), inputCols=None, \</span>
<span class="sd"> outputCols=None, inputCol=None, outputCol=None, relativeError=0.001):</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Imputer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Imputer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="Imputer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;mean&quot;</span><span class="p">,</span>
<span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, strategy=&quot;mean&quot;, missingValue=float(&quot;nan&quot;), inputCols=None, \</span>
<span class="sd"> outputCols=None, inputCol=None, outputCol=None, relativeError=0.001)</span>
<span class="sd"> Sets params for this Imputer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setStrategy"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setStrategy">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStrategy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`strategy`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">strategy</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setMissingValue"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setMissingValue">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMissingValue</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`missingValue`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">missingValue</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Imputer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`relativeError`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ImputerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ImputerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="ImputerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel">[docs]</a><span class="k">class</span> <span class="nc">ImputerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_ImputerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;ImputerModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`Imputer`.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="ImputerModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;ImputerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ImputerModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;ImputerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ImputerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ImputerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ImputerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ImputerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">surrogateDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a DataFrame containing inputCols and their corresponding surrogates,</span>
<span class="sd"> which are used to replace the missing values in the input DataFrame.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;surrogateDF&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="Interaction"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Interaction</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Interaction&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Implements the feature interaction transform. This transformer takes in Double and Vector type</span>
<span class="sd"> columns and outputs a flattened vector of their feature interactions. To handle interaction,</span>
<span class="sd"> we first one-hot encode any nominal features. Then, a vector of the feature cross-products is</span>
<span class="sd"> produced.</span>
<span class="sd"> For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be</span>
<span class="sd"> `Vector(6, 8)` if all input features were numeric. If the first feature were instead nominal</span>
<span class="sd"> with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; interaction = Interaction()</span>
<span class="sd"> &gt;&gt;&gt; interaction.setInputCols([&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> Interaction...</span>
<span class="sd"> &gt;&gt;&gt; interaction.setOutputCol(&quot;ab&quot;)</span>
<span class="sd"> Interaction...</span>
<span class="sd"> &gt;&gt;&gt; interaction.transform(df).show()</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> | a| b| ab|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> |0.0|1.0|[0.0]|</span>
<span class="sd"> |2.0|3.0|[6.0]|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; interactionPath = temp_path + &quot;/interaction&quot;</span>
<span class="sd"> &gt;&gt;&gt; interaction.save(interactionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInteraction = Interaction.load(interactionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCols=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Interaction</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Interaction&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">()</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Interaction.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Interaction&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCols=None, outputCol=None)</span>
<span class="sd"> Sets params for this Interaction.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Interaction.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Interaction&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Interaction.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Interaction&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_MaxAbsScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`MaxAbsScaler` and :py:class:`MaxAbsScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">pass</span>
<div class="viewcode-block" id="MaxAbsScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">MaxAbsScaler</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;MaxAbsScalerModel&quot;</span><span class="p">],</span>
<span class="n">_MaxAbsScalerParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;MaxAbsScaler&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Rescale each feature individually to range [-1, 1] by dividing by the largest</span>
<span class="sd"> absolute value in each feature. It does not shift/center the data, and thus does not destroy</span>
<span class="sd"> any sparsity.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; maScaler = MaxAbsScaler(outputCol=&quot;scaled&quot;)</span>
<span class="sd"> &gt;&gt;&gt; maScaler.setInputCol(&quot;a&quot;)</span>
<span class="sd"> MaxAbsScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = maScaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;scaledOutput&quot;)</span>
<span class="sd"> MaxAbsScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> | a|scaledOutput|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> |[1.0]| [0.5]|</span>
<span class="sd"> |[2.0]| [1.0]|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; scalerPath = temp_path + &quot;/max-abs-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; maScaler.save(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMAScaler = MaxAbsScaler.load(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMAScaler.getInputCol() == maScaler.getInputCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedMAScaler.getOutputCol() == maScaler.getOutputCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/max-abs-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = MaxAbsScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.maxAbs == model.maxAbs</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">MaxAbsScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.MaxAbsScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">()</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="MaxAbsScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MaxAbsScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this MaxAbsScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="MaxAbsScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MaxAbsScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MaxAbsScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MaxAbsScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MaxAbsScalerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">MaxAbsScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="MaxAbsScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel">[docs]</a><span class="k">class</span> <span class="nc">MaxAbsScalerModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span> <span class="n">_MaxAbsScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;MaxAbsScalerModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`MaxAbsScaler`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="MaxAbsScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MaxAbsScalerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MaxAbsScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MaxAbsScalerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">maxAbs</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Max Abs vector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;maxAbs&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinHashLSH"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">MinHashLSH</span><span class="p">(</span>
<span class="n">_LSH</span><span class="p">[</span><span class="s2">&quot;MinHashLSHModel&quot;</span><span class="p">],</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;MinHashLSH&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> LSH class for Jaccard distance.</span>
<span class="sd"> The input can be dense or sparse vectors, but it is more efficient if it is sparse.</span>
<span class="sd"> For example, `Vectors.sparse(10, [(2, 1.0), (3, 1.0), (5, 1.0)])` means there are 10 elements</span>
<span class="sd"> in the space. This set contains elements 2, 3, and 5. Also, any input vector must have at</span>
<span class="sd"> least 1 non-zero index, and all non-zero values are treated as binary &quot;1&quot; values.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> See `Wikipedia on MinHash &lt;https://en.wikipedia.org/wiki/MinHash&gt;`_</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col</span>
<span class="sd"> &gt;&gt;&gt; data = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; mh = MinHashLSH()</span>
<span class="sd"> &gt;&gt;&gt; mh.setInputCol(&quot;features&quot;)</span>
<span class="sd"> MinHashLSH...</span>
<span class="sd"> &gt;&gt;&gt; mh.setOutputCol(&quot;hashes&quot;)</span>
<span class="sd"> MinHashLSH...</span>
<span class="sd"> &gt;&gt;&gt; mh.setSeed(12345)</span>
<span class="sd"> MinHashLSH...</span>
<span class="sd"> &gt;&gt;&gt; model = mh.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCol(&quot;features&quot;)</span>
<span class="sd"> MinHashLSHModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head()</span>
<span class="sd"> Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668...</span>
<span class="sd"> &gt;&gt;&gt; data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(data2, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; key = Vectors.sparse(6, [1, 2], [1.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; model.approxNearestNeighbors(df2, key, 1).collect()</span>
<span class="sd"> [Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668...</span>
<span class="sd"> &gt;&gt;&gt; model.approxSimilarityJoin(df, df2, 0.6, distCol=&quot;JaccardDistance&quot;).select(</span>
<span class="sd"> ... col(&quot;datasetA.id&quot;).alias(&quot;idA&quot;),</span>
<span class="sd"> ... col(&quot;datasetB.id&quot;).alias(&quot;idB&quot;),</span>
<span class="sd"> ... col(&quot;JaccardDistance&quot;)).show()</span>
<span class="sd"> +---+---+---------------+</span>
<span class="sd"> |idA|idB|JaccardDistance|</span>
<span class="sd"> +---+---+---------------+</span>
<span class="sd"> | 0| 5| 0.5|</span>
<span class="sd"> | 1| 4| 0.5|</span>
<span class="sd"> +---+---+---------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; mhPath = temp_path + &quot;/mh&quot;</span>
<span class="sd"> &gt;&gt;&gt; mh.save(mhPath)</span>
<span class="sd"> &gt;&gt;&gt; mh2 = MinHashLSH.load(mhPath)</span>
<span class="sd"> &gt;&gt;&gt; mh2.getOutputCol() == mh.getOutputCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/mh-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model2 = MinHashLSHModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().hashes == model2.transform(df).head().hashes</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">MinHashLSH</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.MinHashLSH&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="MinHashLSH.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinHashLSH&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)</span>
<span class="sd"> Sets params for this MinHashLSH.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinHashLSH.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinHashLSH&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinHashLSHModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">MinHashLSHModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinHashLSHModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSHModel.html#pyspark.ml.feature.MinHashLSHModel">[docs]</a><span class="k">class</span> <span class="nc">MinHashLSHModel</span><span class="p">(</span><span class="n">_LSHModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model produced by :py:class:`MinHashLSH`, where multiple hash functions are stored. Each</span>
<span class="sd"> hash function is picked from the following family of hash functions, where :math:`a_i` and</span>
<span class="sd"> :math:`b_i` are randomly chosen integers less than prime:</span>
<span class="sd"> :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)`. This hash family is approximately min-wise</span>
<span class="sd"> independent according to the reference.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> See Tom Bohman, Colin Cooper, and Alan Frieze. &quot;Min-wise independent linear permutations.&quot;</span>
<span class="sd"> Electronic Journal of Combinatorics 7 (2000): R26.</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<span class="k">class</span> <span class="nc">_MinMaxScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`MinMaxScaler` and :py:class:`MinMaxScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">min</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;min&quot;</span><span class="p">,</span>
<span class="s2">&quot;Lower bound of the output feature range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="nb">max</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;max&quot;</span><span class="p">,</span>
<span class="s2">&quot;Upper bound of the output feature range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_MinMaxScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="nb">max</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of min or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">min</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of max or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">max</span><span class="p">)</span>
<div class="viewcode-block" id="MinMaxScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">MinMaxScaler</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">],</span>
<span class="n">_MinMaxScalerParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;MinMaxScaler&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Rescale each feature individually to a common range [min, max] linearly using column summary</span>
<span class="sd"> statistics, which is also known as min-max normalization or Rescaling. The rescaled value for</span>
<span class="sd"> feature E is calculated as,</span>
<span class="sd"> Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min</span>
<span class="sd"> For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Since zero values will probably be transformed to non-zero values, output of the</span>
<span class="sd"> transformer will be DenseVector even for sparse input.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; mmScaler = MinMaxScaler(outputCol=&quot;scaled&quot;)</span>
<span class="sd"> &gt;&gt;&gt; mmScaler.setInputCol(&quot;a&quot;)</span>
<span class="sd"> MinMaxScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = mmScaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;scaledOutput&quot;)</span>
<span class="sd"> MinMaxScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.originalMin</span>
<span class="sd"> DenseVector([0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.originalMax</span>
<span class="sd"> DenseVector([2.0])</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> | a|scaledOutput|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> |[0.0]| [0.0]|</span>
<span class="sd"> |[2.0]| [1.0]|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; minMaxScalerPath = temp_path + &quot;/min-max-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; mmScaler.save(minMaxScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMMScaler = MinMaxScaler.load(minMaxScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMMScaler.getMin() == mmScaler.getMin()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedMMScaler.getMax() == mmScaler.getMax()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/min-max-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = MinMaxScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.originalMin == model.originalMin</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.originalMax == model.originalMax</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="nb">min</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="nb">max</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">MinMaxScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.MinMaxScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="MinMaxScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="nb">min</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="nb">max</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this MinMaxScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScaler.setMin"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setMin">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`min`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScaler.setMax"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setMax">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`max`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">max</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Highlighted source of MinMaxScaler.setOutputCol; the [docs] anchor links back to its API reference entry. --><div class="viewcode-block" id="MinMaxScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setOutputCol" aria-label="MinMaxScaler.setOutputCol docs">[docs]</a>    <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScaler&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`outputCol`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">MinMaxScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- Highlighted source of class MinMaxScalerModel; each nested viewcode-block wraps one method and its [docs] anchor links back to the matching API reference entry. --><div class="viewcode-block" id="MinMaxScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel" aria-label="MinMaxScalerModel docs">[docs]</a><span class="k">class</span> <span class="nc">MinMaxScalerModel</span><span class="p">(</span>
    <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_MinMaxScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Model fitted by :py:class:`MinMaxScaler`.</span>
<span class="sd">    .. versionadded:: 1.6.0</span>
<span class="sd">    &quot;&quot;&quot;</span>
<div class="viewcode-block" id="MinMaxScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setInputCol" aria-label="MinMaxScalerModel.setInputCol docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`inputCol`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setOutputCol" aria-label="MinMaxScalerModel.setOutputCol docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`outputCol`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScalerModel.setMin"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setMin" aria-label="MinMaxScalerModel.setMin docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setMin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`min`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScalerModel.setMax"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setMax" aria-label="MinMaxScalerModel.setMax docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setMax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;MinMaxScalerModel&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`max`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">max</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
    <span class="nd">@property</span>
    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">originalMin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Min value for each original column during fitting.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;originalMin&quot;</span><span class="p">)</span>
    <span class="nd">@property</span>
    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">originalMax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Max value for each original column during fitting.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;originalMax&quot;</span><span class="p">)</span></div>
<!-- Highlighted source of class NGram; each nested viewcode-block wraps one method and its [docs] anchor links back to the matching API reference entry. --><div class="viewcode-block" id="NGram"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram" aria-label="NGram docs">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">NGram</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;NGram&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    A feature transformer that converts the input array of strings into an array of n-grams. Null</span>
<span class="sd">    values in the input array are ignored.</span>
<span class="sd">    It returns an array of n-grams where each n-gram is represented by a space-separated string of</span>
<span class="sd">    words.</span>
<span class="sd">    When the input is empty, an empty array is returned.</span>
<span class="sd">    When the input array length is less than n (number of elements per n-gram), no n-grams are</span>
<span class="sd">    returned.</span>
<span class="sd">    .. versionadded:: 1.5.0</span>
<span class="sd">    Examples</span>
<span class="sd">    --------</span>
<span class="sd">    &gt;&gt;&gt; df = spark.createDataFrame([Row(inputTokens=[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;, &quot;d&quot;, &quot;e&quot;])])</span>
<span class="sd">    &gt;&gt;&gt; ngram = NGram(n=2)</span>
<span class="sd">    &gt;&gt;&gt; ngram.setInputCol(&quot;inputTokens&quot;)</span>
<span class="sd">    NGram...</span>
<span class="sd">    &gt;&gt;&gt; ngram.setOutputCol(&quot;nGrams&quot;)</span>
<span class="sd">    NGram...</span>
<span class="sd">    &gt;&gt;&gt; ngram.transform(df).head()</span>
<span class="sd">    Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b&#39;, &#39;b c&#39;, &#39;c d&#39;, &#39;d e&#39;])</span>
<span class="sd">    &gt;&gt;&gt; # Change n-gram length</span>
<span class="sd">    &gt;&gt;&gt; ngram.setParams(n=4).transform(df).head()</span>
<span class="sd">    Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="sd">    &gt;&gt;&gt; # Temporarily modify output column.</span>
<span class="sd">    &gt;&gt;&gt; ngram.transform(df, {ngram.outputCol: &quot;output&quot;}).head()</span>
<span class="sd">    Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], output=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="sd">    &gt;&gt;&gt; ngram.transform(df).head()</span>
<span class="sd">    Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="sd">    &gt;&gt;&gt; # Must use keyword arguments to specify params.</span>
<span class="sd">    &gt;&gt;&gt; ngram.setParams(&quot;text&quot;)</span>
<span class="sd">    Traceback (most recent call last):</span>
<span class="sd">        ...</span>
<span class="sd">    TypeError: Method setParams forces keyword arguments.</span>
<span class="sd">    &gt;&gt;&gt; ngramPath = temp_path + &quot;/ngram&quot;</span>
<span class="sd">    &gt;&gt;&gt; ngram.save(ngramPath)</span>
<span class="sd">    &gt;&gt;&gt; loadedNGram = NGram.load(ngramPath)</span>
<span class="sd">    &gt;&gt;&gt; loadedNGram.getN() == ngram.getN()</span>
<span class="sd">    True</span>
<span class="sd">    &gt;&gt;&gt; loadedNGram.transform(df).take(1) == ngram.transform(df).take(1)</span>
<span class="sd">    True</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
    <span class="n">n</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
        <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
        <span class="s2">&quot;n&quot;</span><span class="p">,</span>
        <span class="s2">&quot;number of elements per n-gram (&gt;=1)&quot;</span><span class="p">,</span>
        <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
    <span class="p">)</span>
    <span class="nd">@keyword_only</span>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
        <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
    <span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        __init__(self, \\*, n=2, inputCol=None, outputCol=None)</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="nb">super</span><span class="p">(</span><span class="n">NGram</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.NGram&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
        <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="NGram.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setParams" aria-label="NGram.setParams docs">[docs]</a>    <span class="nd">@keyword_only</span>
    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
        <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
    <span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;NGram&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        setParams(self, \\*, n=2, inputCol=None, outputCol=None)</span>
<span class="sd">        Sets params for this NGram.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.setN"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setN" aria-label="NGram.setN docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setN</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;NGram&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`n`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.getN"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.getN" aria-label="NGram.getN docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">getN</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Gets the value of n or its default value.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setInputCol" aria-label="NGram.setInputCol docs">[docs]</a>    <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;NGram&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`inputCol`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setOutputCol" aria-label="NGram.setOutputCol docs">[docs]</a>    <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;NGram&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`outputCol`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- Highlighted source of class Normalizer; each nested viewcode-block wraps one method and its [docs] anchor links back to the matching API reference entry. --><div class="viewcode-block" id="Normalizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer" aria-label="Normalizer docs">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Normalizer</span><span class="p">(</span>
    <span class="n">JavaTransformer</span><span class="p">,</span>
    <span class="n">HasInputCol</span><span class="p">,</span>
    <span class="n">HasOutputCol</span><span class="p">,</span>
    <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Normalizer&quot;</span><span class="p">],</span>
    <span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Normalize a vector to have unit norm using the given p-norm.</span>
<span class="sd">    .. versionadded:: 1.4.0</span>
<span class="sd">    Examples</span>
<span class="sd">    --------</span>
<span class="sd">    &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd">    &gt;&gt;&gt; svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})</span>
<span class="sd">    &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], [&quot;dense&quot;, &quot;sparse&quot;])</span>
<span class="sd">    &gt;&gt;&gt; normalizer = Normalizer(p=2.0)</span>
<span class="sd">    &gt;&gt;&gt; normalizer.setInputCol(&quot;dense&quot;)</span>
<span class="sd">    Normalizer...</span>
<span class="sd">    &gt;&gt;&gt; normalizer.setOutputCol(&quot;features&quot;)</span>
<span class="sd">    Normalizer...</span>
<span class="sd">    &gt;&gt;&gt; normalizer.transform(df).head().features</span>
<span class="sd">    DenseVector([0.6, -0.8])</span>
<span class="sd">    &gt;&gt;&gt; normalizer.setParams(inputCol=&quot;sparse&quot;, outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd">    SparseVector(4, {1: 0.8, 3: 0.6})</span>
<span class="sd">    &gt;&gt;&gt; params = {normalizer.p: 1.0, normalizer.inputCol: &quot;dense&quot;, normalizer.outputCol: &quot;vector&quot;}</span>
<span class="sd">    &gt;&gt;&gt; normalizer.transform(df, params).head().vector</span>
<span class="sd">    DenseVector([0.4286, -0.5714])</span>
<span class="sd">    &gt;&gt;&gt; normalizerPath = temp_path + &quot;/normalizer&quot;</span>
<span class="sd">    &gt;&gt;&gt; normalizer.save(normalizerPath)</span>
<span class="sd">    &gt;&gt;&gt; loadedNormalizer = Normalizer.load(normalizerPath)</span>
<span class="sd">    &gt;&gt;&gt; loadedNormalizer.getP() == normalizer.getP()</span>
<span class="sd">    True</span>
<span class="sd">    &gt;&gt;&gt; loadedNormalizer.transform(df).take(1) == normalizer.transform(df).take(1)</span>
<span class="sd">    True</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
    <span class="n">p</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;p&quot;</span><span class="p">,</span> <span class="s2">&quot;the p norm value.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
    <span class="nd">@keyword_only</span>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
        <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">p</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
    <span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        __init__(self, \\*, p=2.0, inputCol=None, outputCol=None)</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="nb">super</span><span class="p">(</span><span class="n">Normalizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Normalizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">)</span>
        <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Normalizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setParams" aria-label="Normalizer.setParams docs">[docs]</a>    <span class="nd">@keyword_only</span>
    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
        <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">p</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
    <span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Normalizer&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        setParams(self, \\*, p=2.0, inputCol=None, outputCol=None)</span>
<span class="sd">        Sets params for this Normalizer.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.setP"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setP" aria-label="Normalizer.setP docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">setP</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Normalizer&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`p`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.getP"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.getP" aria-label="Normalizer.getP docs">[docs]</a>    <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
    <span class="k">def</span> <span class="nf">getP</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Gets the value of p or its default value.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">p</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setInputCol" aria-label="Normalizer.setInputCol docs">[docs]</a>    <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Normalizer&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`inputCol`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setOutputCol" aria-label="Normalizer.setOutputCol docs">[docs]</a>    <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Normalizer&quot;</span><span class="p">:</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Sets the value of :py:attr:`outputCol`.</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_OneHotEncoderParams</span><span class="p">(</span>
<span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span> <span class="n">HasHandleInvalid</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`OneHotEncoder` and :py:class:`OneHotEncoderModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to handle invalid data during &quot;</span>
<span class="o">+</span> <span class="s2">&quot;transform(). Options are &#39;keep&#39; (invalid data presented as an extra &quot;</span>
<span class="o">+</span> <span class="s2">&quot;categorical feature) or error (throw an error). Note that this Param &quot;</span>
<span class="o">+</span> <span class="s2">&quot;is only used during transform; during fitting, invalid data will &quot;</span>
<span class="o">+</span> <span class="s2">&quot;result in an error.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">dropLast</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;dropLast&quot;</span><span class="p">,</span>
<span class="s2">&quot;whether to drop the last category&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_OneHotEncoderParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">dropLast</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of dropLast or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dropLast</span><span class="p">)</span>
<div class="viewcode-block" id="OneHotEncoder"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">OneHotEncoder</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">],</span>
<span class="n">_OneHotEncoderParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A one-hot encoder that maps a column of category indices to a column of binary vectors, with</span>
<span class="sd"> at most a single one-value per row that indicates the input category index.</span>
<span class="sd"> For example, with 5 categories, an input value of 2.0 would map to an output vector of</span>
<span class="sd"> `[0.0, 0.0, 1.0, 0.0]`.</span>
<span class="sd"> The last category is not included by default (configurable via :py:attr:`dropLast`),</span>
<span class="sd"> because including it would make the vector entries sum up to one, and hence linearly dependent.</span>
<span class="sd"> So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.</span>
<span class="sd"> When :py:attr:`handleInvalid` is configured to &#39;keep&#39;, an extra &quot;category&quot; indicating invalid</span>
<span class="sd"> values is added as last category. So when :py:attr:`dropLast` is true, invalid values are</span>
<span class="sd"> encoded as an all-zeros vector.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is different from scikit-learn&#39;s OneHotEncoder, which keeps all categories.</span>
<span class="sd"> The output vectors are sparse.</span>
<span class="sd"> When encoding multiple columns by using :py:attr:`inputCols` and</span>
<span class="sd"> :py:attr:`outputCols` params, input/output cols come in pairs, specified by the order in</span>
<span class="sd"> the arrays, and each pair is treated independently.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> StringIndexer : for converting categorical values into category indices</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], [&quot;input&quot;])</span>
<span class="sd"> &gt;&gt;&gt; ohe = OneHotEncoder()</span>
<span class="sd"> &gt;&gt;&gt; ohe.setInputCols([&quot;input&quot;])</span>
<span class="sd"> OneHotEncoder...</span>
<span class="sd"> &gt;&gt;&gt; ohe.setOutputCols([&quot;output&quot;])</span>
<span class="sd"> OneHotEncoder...</span>
<span class="sd"> &gt;&gt;&gt; model = ohe.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCols([&quot;output&quot;])</span>
<span class="sd"> OneHotEncoderModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getHandleInvalid()</span>
<span class="sd"> &#39;error&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().output</span>
<span class="sd"> SparseVector(2, {0: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; single_col_ohe = OneHotEncoder(inputCol=&quot;input&quot;, outputCol=&quot;output&quot;)</span>
<span class="sd"> &gt;&gt;&gt; single_col_model = single_col_ohe.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; single_col_model.transform(df).head().output</span>
<span class="sd"> SparseVector(2, {0: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; ohePath = temp_path + &quot;/ohe&quot;</span>
<span class="sd"> &gt;&gt;&gt; ohe.save(ohePath)</span>
<span class="sd"> &gt;&gt;&gt; loadedOHE = OneHotEncoder.load(ohePath)</span>
<span class="sd"> &gt;&gt;&gt; loadedOHE.getInputCols() == ohe.getInputCols()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/ohe-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = OneHotEncoderModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.categorySizes == model.categorySizes</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCols=None, outputCols=None, handleInvalid=&quot;error&quot;, dropLast=True, \</span>
<span class="sd"> inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">OneHotEncoder</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.OneHotEncoder&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="OneHotEncoder.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCols=None, outputCols=None, handleInvalid=&quot;error&quot;, \</span>
<span class="sd"> dropLast=True, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this OneHotEncoder.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setDropLast"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setDropLast">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`dropLast`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">OneHotEncoderModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel">[docs]</a><span class="k">class</span> <span class="nc">OneHotEncoderModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span> <span class="n">_OneHotEncoderParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`OneHotEncoder`.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="OneHotEncoderModel.setDropLast"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setDropLast">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`dropLast`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;OneHotEncoderModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">categorySizes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Original number of categories for each feature being encoded.</span>
<span class="sd"> The array contains one value for each input column, in order.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;categorySizes&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PolynomialExpansion</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;PolynomialExpansion&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Perform feature expansion in a polynomial space. As stated in the `Wikipedia article on Polynomial Expansion</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/Polynomial_expansion&gt;`_, &quot;In mathematics, an</span>
<span class="sd"> expansion of a product of sums expresses it as a sum of products by using the fact that</span>
<span class="sd"> multiplication distributes over addition&quot;. Take a 2-variable feature vector `(x, y)` as an</span>
<span class="sd"> example: if we expand it with degree 2, we get `(x, x * x, y, x * y, y * y)`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], [&quot;dense&quot;])</span>
<span class="sd"> &gt;&gt;&gt; px = PolynomialExpansion(degree=2)</span>
<span class="sd"> &gt;&gt;&gt; px.setInputCol(&quot;dense&quot;)</span>
<span class="sd"> PolynomialExpansion...</span>
<span class="sd"> &gt;&gt;&gt; px.setOutputCol(&quot;expanded&quot;)</span>
<span class="sd"> PolynomialExpansion...</span>
<span class="sd"> &gt;&gt;&gt; px.transform(df).head().expanded</span>
<span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="sd"> &gt;&gt;&gt; px.setParams(outputCol=&quot;test&quot;).transform(df).head().test</span>
<span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="sd"> &gt;&gt;&gt; polyExpansionPath = temp_path + &quot;/poly-expansion&quot;</span>
<span class="sd"> &gt;&gt;&gt; px.save(polyExpansionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPx = PolynomialExpansion.load(polyExpansionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPx.getDegree() == px.getDegree()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedPx.transform(df).take(1) == px.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">degree</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;degree&quot;</span><span class="p">,</span>
<span class="s2">&quot;the polynomial degree to expand (&gt;= 1)&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">degree</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, degree=2, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PolynomialExpansion</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.PolynomialExpansion&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="PolynomialExpansion.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">degree</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PolynomialExpansion&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, degree=2, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this PolynomialExpansion.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.setDegree"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setDegree">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PolynomialExpansion&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`degree`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.getDegree"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.getDegree">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of degree or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">degree</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PolynomialExpansion&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PolynomialExpansion&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="QuantileDiscretizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">QuantileDiscretizer</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span>
<span class="n">HasRelativeError</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :py:class:`QuantileDiscretizer` takes a column with continuous features and outputs a column</span>
<span class="sd"> with binned categorical features. The number of bins can be set using the :py:attr:`numBuckets`</span>
<span class="sd"> parameter. It is possible that the number of buckets used will be less than this value, for</span>
<span class="sd"> example, if there are too few distinct values of the input to create enough distinct quantiles.</span>
<span class="sd"> Since 3.0.0, :py:class:`QuantileDiscretizer` can map multiple columns at once by setting the</span>
<span class="sd"> :py:attr:`inputCols` parameter. If both of the :py:attr:`inputCol` and :py:attr:`inputCols`</span>
<span class="sd"> parameters are set, an Exception will be thrown. To specify the number of buckets for each</span>
<span class="sd"> column, the :py:attr:`numBucketsArray` parameter can be set, or if the number of buckets</span>
<span class="sd"> should be the same across columns, :py:attr:`numBuckets` can be set as a convenience.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> NaN handling: By default,</span>
<span class="sd"> :py:class:`QuantileDiscretizer` will raise an error when it finds NaN values in the dataset,</span>
<span class="sd"> but the user can choose to either keep or remove NaN values within the dataset by setting the</span>
<span class="sd"> :py:attr:`handleInvalid` parameter. If the user chooses to keep NaN values, they will be</span>
<span class="sd"> handled specially and placed into their own bucket, for example, if 4 buckets are used, then</span>
<span class="sd"> non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4].</span>
<span class="sd"> Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for</span>
<span class="sd"> :py:meth:`pyspark.sql.DataFrameStatFunctions.approxQuantile` for a detailed description).</span>
<span class="sd"> The precision of the approximation can be controlled with the</span>
<span class="sd"> :py:attr:`relativeError` parameter.</span>
<span class="sd"> The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(&quot;nan&quot;),), (float(&quot;nan&quot;),)]</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame(values, [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds1 = QuantileDiscretizer(inputCol=&quot;values&quot;, outputCol=&quot;buckets&quot;)</span>
<span class="sd"> &gt;&gt;&gt; qds1.setNumBuckets(2)</span>
<span class="sd"> QuantileDiscretizer...</span>
<span class="sd"> &gt;&gt;&gt; qds1.setRelativeError(0.01)</span>
<span class="sd"> QuantileDiscretizer...</span>
<span class="sd"> &gt;&gt;&gt; qds1.setHandleInvalid(&quot;error&quot;)</span>
<span class="sd"> QuantileDiscretizer...</span>
<span class="sd"> &gt;&gt;&gt; qds1.getRelativeError()</span>
<span class="sd"> 0.01</span>
<span class="sd"> &gt;&gt;&gt; bucketizer = qds1.fit(df1)</span>
<span class="sd"> &gt;&gt;&gt; qds1.setHandleInvalid(&quot;keep&quot;).fit(df1).transform(df1).count()</span>
<span class="sd"> 6</span>
<span class="sd"> &gt;&gt;&gt; qds1.setHandleInvalid(&quot;skip&quot;).fit(df1).transform(df1).count()</span>
<span class="sd"> 4</span>
<span class="sd"> &gt;&gt;&gt; splits = bucketizer.getSplits()</span>
<span class="sd"> &gt;&gt;&gt; splits[0]</span>
<span class="sd"> -inf</span>
<span class="sd"> &gt;&gt;&gt; print(&quot;%2.1f&quot; % round(splits[1], 1))</span>
<span class="sd"> 0.4</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.transform(df1).head()</span>
<span class="sd"> &gt;&gt;&gt; bucketed.buckets</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; quantileDiscretizerPath = temp_path + &quot;/quantile-discretizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; qds1.save(quantileDiscretizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedQds = QuantileDiscretizer.load(quantileDiscretizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedQds.getNumBuckets() == qds1.getNumBuckets()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; inputs = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, 1.5),</span>
<span class="sd"> ... (float(&quot;nan&quot;), float(&quot;nan&quot;)), (float(&quot;nan&quot;), float(&quot;nan&quot;))]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(inputs, [&quot;input1&quot;, &quot;input2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds2 = QuantileDiscretizer(relativeError=0.01, handleInvalid=&quot;error&quot;, numBuckets=2,</span>
<span class="sd"> ... inputCols=[&quot;input1&quot;, &quot;input2&quot;], outputCols=[&quot;output1&quot;, &quot;output2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds2.getRelativeError()</span>
<span class="sd"> 0.01</span>
<span class="sd"> &gt;&gt;&gt; qds2.setHandleInvalid(&quot;keep&quot;).fit(df2).transform(df2).show()</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> |input1|input2|output1|output2|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> | 0.1| 0.0| 0.0| 0.0|</span>
<span class="sd"> | 0.4| 1.0| 1.0| 1.0|</span>
<span class="sd"> | 1.2| 1.3| 1.0| 1.0|</span>
<span class="sd"> | 1.5| 1.5| 1.0| 1.0|</span>
<span class="sd"> | NaN| NaN| 2.0| 2.0|</span>
<span class="sd"> | NaN| NaN| 2.0| 2.0|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; qds3 = QuantileDiscretizer(relativeError=0.01, handleInvalid=&quot;error&quot;,</span>
<span class="sd"> ... numBucketsArray=[5, 10], inputCols=[&quot;input1&quot;, &quot;input2&quot;],</span>
<span class="sd"> ... outputCols=[&quot;output1&quot;, &quot;output2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds3.setHandleInvalid(&quot;skip&quot;).fit(df2).transform(df2).show()</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> |input1|input2|output1|output2|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> | 0.1| 0.0| 1.0| 1.0|</span>
<span class="sd"> | 0.4| 1.0| 2.0| 2.0|</span>
<span class="sd"> | 1.2| 1.3| 3.0| 3.0|</span>
<span class="sd"> | 1.5| 1.5| 4.0| 4.0|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">numBuckets</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;numBuckets&quot;</span><span class="p">,</span>
<span class="s2">&quot;Maximum number of buckets (quantiles, or &quot;</span>
<span class="o">+</span> <span class="s2">&quot;categories) into which data points are grouped. Must be &gt;= 2.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;how to handle invalid entries. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Options are skip (filter out rows with invalid values), &quot;</span>
<span class="o">+</span> <span class="s2">&quot;error (throw an error), or keep (keep invalid values in a special &quot;</span>
<span class="o">+</span> <span class="s2">&quot;additional bucket). Note that in the multiple columns &quot;</span>
<span class="o">+</span> <span class="s2">&quot;case, the invalid handling is applied to all columns. That said &quot;</span>
<span class="o">+</span> <span class="s2">&quot;for &#39;error&#39; it will throw an error if any invalids are found in &quot;</span>
<span class="o">+</span> <span class="s2">&quot;any columns, for &#39;skip&#39; it will skip rows with any invalids in &quot;</span>
<span class="o">+</span> <span class="s2">&quot;any columns, etc.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;numBucketsArray&quot;</span><span class="p">,</span>
<span class="s2">&quot;Array of number of buckets &quot;</span>
<span class="o">+</span> <span class="s2">&quot;(quantiles, or categories) into which data points are grouped. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;This is for multiple columns input. If transforming multiple &quot;</span>
<span class="o">+</span> <span class="s2">&quot;columns and numBucketsArray is not set, but numBuckets is set, &quot;</span>
<span class="o">+</span> <span class="s2">&quot;then numBuckets will be applied across all columns.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, numBucketsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">QuantileDiscretizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.QuantileDiscretizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="QuantileDiscretizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, numBucketsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> Set the params for the QuantileDiscretizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setNumBuckets"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setNumBuckets">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumBuckets</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numBuckets`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.getNumBuckets"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.getNumBuckets">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumBuckets</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numBuckets or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numBuckets</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setNumBucketsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setNumBucketsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumBucketsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numBucketsArray`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numBucketsArray</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.getNumBucketsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.getNumBucketsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumBucketsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numBucketsArray or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numBucketsArray</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`relativeError`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;QuantileDiscretizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Bucketizer</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Private method to convert the java_model to a Python model.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">isSet</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">inputCol</span><span class="p">):</span>
<span class="k">return</span> <span class="n">Bucketizer</span><span class="p">(</span>
<span class="n">splits</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">java_model</span><span class="o">.</span><span class="n">getSplits</span><span class="p">()),</span>
<span class="n">inputCol</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getInputCol</span><span class="p">(),</span>
<span class="n">outputCol</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getOutputCol</span><span class="p">(),</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">(),</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">splitsArrayList</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">java_model</span><span class="o">.</span><span class="n">getSplitsArray</span><span class="p">())]</span>
<span class="k">return</span> <span class="n">Bucketizer</span><span class="p">(</span>
<span class="n">splitsArray</span><span class="o">=</span><span class="n">splitsArrayList</span><span class="p">,</span>
<span class="n">inputCols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getInputCols</span><span class="p">(),</span>
<span class="n">outputCols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getOutputCols</span><span class="p">(),</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">(),</span>
<span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_RobustScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`RobustScaler` and :py:class:`RobustScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">lower</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;lower&quot;</span><span class="p">,</span>
<span class="s2">&quot;Lower quantile to calculate quantile range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">upper</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;upper&quot;</span><span class="p">,</span>
<span class="s2">&quot;Upper quantile to calculate quantile range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">withCentering</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;withCentering&quot;</span><span class="p">,</span>
<span class="s2">&quot;Whether to center data with median&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">withScaling</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;withScaling&quot;</span><span class="p">,</span>
<span class="s2">&quot;Whether to scale the data to &quot;</span> <span class="s2">&quot;quantile range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span>
<span class="n">lower</span><span class="o">=</span><span class="mf">0.25</span><span class="p">,</span> <span class="n">upper</span><span class="o">=</span><span class="mf">0.75</span><span class="p">,</span> <span class="n">withCentering</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withScaling</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLower</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of lower or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">lower</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getUpper</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of upper or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">upper</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithCentering</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withCentering or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withCentering</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithScaling</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withScaling or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withScaling</span><span class="p">)</span>
<div class="viewcode-block" id="RobustScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">RobustScaler</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;RobustScaler&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> RobustScaler removes the median and scales the data according to the quantile range.</span>
<span class="sd"> The quantile range is by default IQR (Interquartile Range, quantile range between the</span>
<span class="sd"> 1st quartile = 25th percentile and the 3rd quartile = 75th percentile) but can be configured.</span>
<span class="sd"> Centering and scaling happen independently on each feature by computing the relevant</span>
<span class="sd"> statistics on the samples in the training set. Median and quantile range are then</span>
<span class="sd"> stored to be used on later data using the transform method.</span>
<span class="sd"> Note that NaN values are ignored in the computation of medians and ranges.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; data = [(0, Vectors.dense([0.0, 0.0]),),</span>
<span class="sd"> ... (1, Vectors.dense([1.0, -1.0]),),</span>
<span class="sd"> ... (2, Vectors.dense([2.0, -2.0]),),</span>
<span class="sd"> ... (3, Vectors.dense([3.0, -3.0]),),</span>
<span class="sd"> ... (4, Vectors.dense([4.0, -4.0]),),]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; scaler = RobustScaler()</span>
<span class="sd"> &gt;&gt;&gt; scaler.setInputCol(&quot;features&quot;)</span>
<span class="sd"> RobustScaler...</span>
<span class="sd"> &gt;&gt;&gt; scaler.setOutputCol(&quot;scaled&quot;)</span>
<span class="sd"> RobustScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = scaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> RobustScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.median</span>
<span class="sd"> DenseVector([2.0, -2.0])</span>
<span class="sd"> &gt;&gt;&gt; model.range</span>
<span class="sd"> DenseVector([2.0, 2.0])</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).collect()[1].output</span>
<span class="sd"> DenseVector([0.5, -0.5])</span>
<span class="sd"> &gt;&gt;&gt; scalerPath = temp_path + &quot;/robust-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; scaler.save(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedScaler = RobustScaler.load(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedScaler.getWithCentering() == scaler.getWithCentering()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedScaler.getWithScaling() == scaler.getWithScaling()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/robust-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = RobustScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.median == model.median</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.range == model.range</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">lower</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.25</span><span class="p">,</span>
<span class="n">upper</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.75</span><span class="p">,</span>
<span class="n">withCentering</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">withScaling</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, lower=0.25, upper=0.75, withCentering=False, withScaling=True, \</span>
<span class="sd"> inputCol=None, outputCol=None, relativeError=0.001)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RobustScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.RobustScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="RobustScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">lower</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.25</span><span class="p">,</span>
<span class="n">upper</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.75</span><span class="p">,</span>
<span class="n">withCentering</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">withScaling</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, lower=0.25, upper=0.75, withCentering=False, withScaling=True, \</span>
<span class="sd"> inputCol=None, outputCol=None, relativeError=0.001)</span>
<span class="sd"> Sets params for this RobustScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setLower"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setLower">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLower</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`lower`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">lower</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setUpper"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setUpper">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setUpper</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`upper`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">upper</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setWithCentering"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setWithCentering">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithCentering</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withCentering`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withCentering</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setWithScaling"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setWithScaling">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithScaling</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withScaling`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withScaling</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`relativeError`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScalerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">RobustScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel">[docs]</a><span class="k">class</span> <span class="nc">RobustScalerModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span> <span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;RobustScalerModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`RobustScaler`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="RobustScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScalerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RobustScalerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Median of the RobustScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;median&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">range</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Quantile range of the RobustScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;range&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">RegexTokenizer</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A regex-based tokenizer that extracts tokens either by using the</span>
<span class="sd"> provided regex pattern (in Java dialect) to split the text</span>
<span class="sd"> (default) or by repeatedly matching the regex (if ``gaps`` is False).</span>
<span class="sd"> Optional parameters also allow filtering tokens by a minimum</span>
<span class="sd"> length.</span>
<span class="sd"> It returns an array of strings that can be empty.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;A B c&quot;,)], [&quot;text&quot;])</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer = RegexTokenizer()</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setInputCol(&quot;text&quot;)</span>
<span class="sd"> RegexTokenizer...</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setOutputCol(&quot;words&quot;)</span>
<span class="sd"> RegexTokenizer...</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Change a parameter.</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setParams(outputCol=&quot;tokens&quot;).transform(df).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Temporarily modify a parameter.</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.transform(df, {reTokenizer.outputCol: &quot;words&quot;}).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Must use keyword arguments to specify params.</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setParams(&quot;text&quot;)</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> &gt;&gt;&gt; regexTokenizerPath = temp_path + &quot;/regex-tokenizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.save(regexTokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer = RegexTokenizer.load(regexTokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer.getMinTokenLength() == reTokenizer.getMinTokenLength()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer.getGaps() == reTokenizer.getGaps()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer.transform(df).take(1) == reTokenizer.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">minTokenLength</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;minTokenLength&quot;</span><span class="p">,</span>
<span class="s2">&quot;minimum token length (&gt;= 0)&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">gaps</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;gaps&quot;</span><span class="p">,</span>
<span class="s2">&quot;whether regex splits on gaps (True) or matches tokens &quot;</span> <span class="o">+</span> <span class="s2">&quot;(False)&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">pattern</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;pattern&quot;</span><span class="p">,</span>
<span class="s2">&quot;regex pattern (Java dialect) used for tokenizing&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">toLowercase</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;toLowercase&quot;</span><span class="p">,</span>
<span class="s2">&quot;whether to convert all characters to &quot;</span> <span class="o">+</span> <span class="s2">&quot;lowercase before tokenizing&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">minTokenLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">gaps</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;</span><span class="se">\\</span><span class="s2">s+&quot;</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">toLowercase</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, minTokenLength=1, gaps=True, pattern=&quot;\\s+&quot;, inputCol=None, \</span>
<span class="sd"> outputCol=None, toLowercase=True)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RegexTokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.RegexTokenizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s2">&quot;</span><span class="se">\\</span><span class="s2">s+&quot;</span><span class="p">,</span> <span class="n">toLowercase</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="RegexTokenizer.setParams"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setParams" aria-label="docs for RegexTokenizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">minTokenLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">gaps</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;</span><span class="se">\\</span><span class="s2">s+&quot;</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">toLowercase</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, minTokenLength=1, gaps=True, pattern=&quot;\\s+&quot;, inputCol=None, \</span>
<span class="sd"> outputCol=None, toLowercase=True)</span>
<span class="sd"> Sets params for this RegexTokenizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setMinTokenLength"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setMinTokenLength" aria-label="docs for RegexTokenizer.setMinTokenLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minTokenLength`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTokenLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getMinTokenLength"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getMinTokenLength" aria-label="docs for RegexTokenizer.getMinTokenLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minTokenLength or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minTokenLength</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setGaps"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setGaps" aria-label="docs for RegexTokenizer.setGaps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`gaps`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">gaps</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getGaps"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getGaps" aria-label="docs for RegexTokenizer.getGaps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of gaps or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">gaps</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setPattern"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setPattern" aria-label="docs for RegexTokenizer.setPattern">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`pattern`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">pattern</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getPattern"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getPattern" aria-label="docs for RegexTokenizer.getPattern">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of pattern or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pattern</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setToLowercase"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setToLowercase" aria-label="docs for RegexTokenizer.setToLowercase">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setToLowercase</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`toLowercase`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">toLowercase</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getToLowercase"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getToLowercase" aria-label="docs for RegexTokenizer.getToLowercase">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getToLowercase</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of toLowercase or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">toLowercase</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setInputCol"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setInputCol" aria-label="docs for RegexTokenizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setOutputCol"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setOutputCol" aria-label="docs for RegexTokenizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RegexTokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="SQLTransformer"><!-- Highlighted source of SQLTransformer. Each "[docs]" backlink in this block carries an aria-label naming its target, because the visible link text is identical for every entry. --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer" aria-label="docs for SQLTransformer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">SQLTransformer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;SQLTransformer&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Implements the transforms which are defined by SQL statement.</span>
<span class="sd"> Currently we only support SQL syntax like `SELECT ... FROM __THIS__`</span>
<span class="sd"> where `__THIS__` represents the underlying table of the input dataset.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], [&quot;id&quot;, &quot;v1&quot;, &quot;v2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; sqlTrans = SQLTransformer(</span>
<span class="sd"> ... statement=&quot;SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__&quot;)</span>
<span class="sd"> &gt;&gt;&gt; sqlTrans.transform(df).head()</span>
<span class="sd"> Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)</span>
<span class="sd"> &gt;&gt;&gt; sqlTransformerPath = temp_path + &quot;/sql-transformer&quot;</span>
<span class="sd"> &gt;&gt;&gt; sqlTrans.save(sqlTransformerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSqlTrans = SQLTransformer.load(sqlTransformerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSqlTrans.getStatement() == sqlTrans.getStatement()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedSqlTrans.transform(df).take(1) == sqlTrans.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">statement</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;statement&quot;</span><span class="p">,</span> <span class="s2">&quot;SQL statement&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, statement=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">SQLTransformer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.SQLTransformer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="SQLTransformer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.setParams" aria-label="docs for SQLTransformer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SQLTransformer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, statement=None)</span>
<span class="sd"> Sets params for this SQLTransformer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="SQLTransformer.setStatement"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.setStatement" aria-label="docs for SQLTransformer.setStatement">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStatement</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SQLTransformer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`statement`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">statement</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="SQLTransformer.getStatement"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.getStatement" aria-label="docs for SQLTransformer.getStatement">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStatement</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of statement or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">statement</span><span class="p">)</span></div></div>
<!-- Highlighted source of _StandardScalerParams, which its docstring describes as the params for StandardScaler and StandardScalerModel. It is emitted as bare token spans with no viewcode-block wrapper, id anchor or [docs] backlink. Comments in this listing sit on the same line as the first token so that no extra line break is introduced if the listing is rendered as preformatted text. --><span class="k">class</span> <span class="nc">_StandardScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`StandardScaler` and :py:class:`StandardScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<!-- Two boolean params follow, both declared with TypeConverters.toBoolean. --><span class="n">withMean</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;withMean&quot;</span><span class="p">,</span> <span class="s2">&quot;Center data with mean&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span>
<span class="p">)</span>
<span class="n">withStd</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;withStd&quot;</span><span class="p">,</span>
<span class="s2">&quot;Scale to unit standard deviation&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<!-- Constructor: forwards *args to the parent initialiser, then registers the defaults withMean=False and withStd=True. --><span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_StandardScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<!-- Getters: each returns getOrDefault for its param, i.e. the set value or the default registered above. --><span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withMean or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withMean</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withStd or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withStd</span><span class="p">)</span>
<div class="viewcode-block" id="StandardScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">StandardScaler</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;StandardScalerModel&quot;</span><span class="p">],</span>
<span class="n">_StandardScalerParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;StandardScaler&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Standardizes features by removing the mean and scaling to unit variance using column summary</span>
<span class="sd"> statistics on the samples in the training set.</span>
<span class="sd"> The &quot;unit std&quot; is computed using the `corrected sample standard deviation \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation&gt;`_,</span>
<span class="sd"> which is computed as the square root of the unbiased sample variance.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; standardScaler = StandardScaler()</span>
<span class="sd"> &gt;&gt;&gt; standardScaler.setInputCol(&quot;a&quot;)</span>
<span class="sd"> StandardScaler...</span>
<span class="sd"> &gt;&gt;&gt; standardScaler.setOutputCol(&quot;scaled&quot;)</span>
<span class="sd"> StandardScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = standardScaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getInputCol()</span>
<span class="sd"> &#39;a&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> StandardScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.mean</span>
<span class="sd"> DenseVector([1.0])</span>
<span class="sd"> &gt;&gt;&gt; model.std</span>
<span class="sd"> DenseVector([1.4142])</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).collect()[1].output</span>
<span class="sd"> DenseVector([1.4142])</span>
<span class="sd"> &gt;&gt;&gt; standardScalerPath = temp_path + &quot;/standard-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; standardScaler.save(standardScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedStandardScaler = StandardScaler.load(standardScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedStandardScaler.getWithMean() == standardScaler.getWithMean()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedStandardScaler.getWithStd() == standardScaler.getWithStd()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/standard-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = StandardScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.std == model.std</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.mean == model.mean</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<!-- Highlighted source of StandardScaler.__init__, a keyword-only constructor (note the bare "*" parameter). The listed code calls _new_java_obj with "org.apache.spark.ml.feature.StandardScaler" and self.uid, then forwards self._input_kwargs to setParams. Emitted as bare token spans with no viewcode-block wrapper or [docs] backlink; this comment shares a line with the first token so no extra line break is introduced if the listing is rendered as preformatted text. --><span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">withMean</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">withStd</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, withMean=False, withStd=True, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StandardScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.StandardScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="StandardScaler.setParams"><!-- aria-label: distinct accessible name for this repeated "[docs]" backlink --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setParams" aria-label="docs for StandardScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">withMean</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">withStd</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, withMean=False, withStd=True, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this StandardScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setWithMean"><!-- Highlighted source of StandardScaler.setWithMean; the [docs] anchor points back to its API reference entry and is labelled so it can be told apart from the other [docs] links. --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setWithMean" aria-label="docs for StandardScaler.setWithMean">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withMean`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setWithStd"><!-- Highlighted source of StandardScaler.setWithStd; the [docs] anchor points back to its API reference entry and is labelled so it can be told apart from the other [docs] links. --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setWithStd" aria-label="docs for StandardScaler.setWithStd">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withStd`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withStd</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setInputCol"><!-- Highlighted source of StandardScaler.setInputCol; the [docs] anchor points back to its API reference entry and is labelled so it can be told apart from the other [docs] links. --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setInputCol" aria-label="docs for StandardScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setOutputCol"><!-- Highlighted source of StandardScaler.setOutputCol; the [docs] anchor points back to its API reference entry and is labelled so it can be told apart from the other [docs] links. --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setOutputCol" aria-label="docs for StandardScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScaler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScalerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">StandardScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScalerModel"><!-- Highlighted source of the StandardScalerModel class; each [docs] anchor (class and nested methods) points back to the matching API reference entry and is labelled so the links can be told apart. --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel" aria-label="docs for StandardScalerModel">[docs]</a><span class="k">class</span> <span class="nc">StandardScalerModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span>
<span class="n">_StandardScalerParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;StandardScalerModel&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`StandardScaler`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="StandardScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel.setInputCol" aria-label="docs for StandardScalerModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScalerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel.setOutputCol" aria-label="docs for StandardScalerModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StandardScalerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Standard deviation of the StandardScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;std&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mean of the StandardScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;mean&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_StringIndexerParams</span><span class="p">(</span>
<span class="n">JavaParams</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`StringIndexer` and :py:class:`StringIndexerModel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stringOrderType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;stringOrderType&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to order labels of string column. The first label after &quot;</span>
<span class="o">+</span> <span class="s2">&quot;ordering is assigned an index of 0. Supported options: &quot;</span>
<span class="o">+</span> <span class="s2">&quot;frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Default is frequencyDesc. In case of equal frequency when &quot;</span>
<span class="o">+</span> <span class="s2">&quot;under frequencyDesc/Asc, the strings are further sorted &quot;</span>
<span class="o">+</span> <span class="s2">&quot;alphabetically&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;how to handle invalid data (unseen &quot;</span>
<span class="o">+</span> <span class="s2">&quot;or NULL values) in features and label column of string type. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Options are &#39;skip&#39; (filter out rows with invalid data), &quot;</span>
<span class="o">+</span> <span class="s2">&quot;error (throw an error), or &#39;keep&#39; (put invalid data &quot;</span>
<span class="o">+</span> <span class="s2">&quot;in a special additional bucket, at index numLabels).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_StringIndexerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">stringOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStringOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`stringOrderType` or its default value &#39;frequencyDesc&#39;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stringOrderType</span><span class="p">)</span>
<div class="viewcode-block" id="StringIndexer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">StringIndexer</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">],</span>
<span class="n">_StringIndexerParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;StringIndexer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A label indexer that maps a string column of labels to an ML column of label indices.</span>
<span class="sd"> If the input column is numeric, we cast it to string and index the string values.</span>
<span class="sd"> The indices are in [0, numLabels). By default, this is ordered by label frequencies</span>
<span class="sd"> so the most frequent label gets index 0. The ordering behavior is controlled by</span>
<span class="sd"> setting :py:attr:`stringOrderType`. Its default value is &#39;frequencyDesc&#39;.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer = StringIndexer(inputCol=&quot;label&quot;, outputCol=&quot;indexed&quot;,</span>
<span class="sd"> ... stringOrderType=&quot;frequencyDesc&quot;)</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer.setHandleInvalid(&quot;error&quot;)</span>
<span class="sd"> StringIndexer...</span>
<span class="sd"> &gt;&gt;&gt; model = stringIndexer.fit(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; model.setHandleInvalid(&quot;error&quot;)</span>
<span class="sd"> StringIndexerModel...</span>
<span class="sd"> &gt;&gt;&gt; td = model.transform(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]</span>
<span class="sd"> &gt;&gt;&gt; inverter = IndexToString(inputCol=&quot;indexed&quot;, outputCol=&quot;label2&quot;, labels=model.labels)</span>
<span class="sd"> &gt;&gt;&gt; itd = inverter.transform(td)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, &#39;a&#39;), (1, &#39;b&#39;), (2, &#39;c&#39;), (3, &#39;a&#39;), (4, &#39;a&#39;), (5, &#39;c&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; stringIndexerPath = temp_path + &quot;/string-indexer&quot;</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer.save(stringIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer = StringIndexer.load(stringIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer.getHandleInvalid() == stringIndexer.getHandleInvalid()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/string-indexer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = StringIndexerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.labels == model.labels</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; indexToStringPath = temp_path + &quot;/index-to-string&quot;</span>
<span class="sd"> &gt;&gt;&gt; inverter.save(indexToStringPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInverter = IndexToString.load(indexToStringPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInverter.getLabels() == inverter.getLabels()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(stringIndDf).take(1) == model.transform(stringIndDf).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer.getStringOrderType()</span>
<span class="sd"> &#39;frequencyDesc&#39;</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer = StringIndexer(inputCol=&quot;label&quot;, outputCol=&quot;indexed&quot;, handleInvalid=&quot;error&quot;,</span>
<span class="sd"> ... stringOrderType=&quot;alphabetDesc&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = stringIndexer.fit(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; td = model.transform(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)]</span>
<span class="sd"> &gt;&gt;&gt; fromlabelsModel = StringIndexerModel.from_labels([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],</span>
<span class="sd"> ... inputCol=&quot;label&quot;, outputCol=&quot;indexed&quot;, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result = fromlabelsModel.transform(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1]) for i in result.select(result.id, result.indexed).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 2.0)]</span>
<span class="sd"> &gt;&gt;&gt; testData = sc.parallelize([Row(id=0, label1=&quot;a&quot;, label2=&quot;e&quot;),</span>
<span class="sd"> ... Row(id=1, label1=&quot;b&quot;, label2=&quot;f&quot;),</span>
<span class="sd"> ... Row(id=2, label1=&quot;c&quot;, label2=&quot;e&quot;),</span>
<span class="sd"> ... Row(id=3, label1=&quot;a&quot;, label2=&quot;f&quot;),</span>
<span class="sd"> ... Row(id=4, label1=&quot;a&quot;, label2=&quot;f&quot;),</span>
<span class="sd"> ... Row(id=5, label1=&quot;c&quot;, label2=&quot;f&quot;)], 3)</span>
<span class="sd"> &gt;&gt;&gt; multiRowDf = spark.createDataFrame(testData)</span>
<span class="sd"> &gt;&gt;&gt; inputs = [&quot;label1&quot;, &quot;label2&quot;]</span>
<span class="sd"> &gt;&gt;&gt; outputs = [&quot;index1&quot;, &quot;index2&quot;]</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer = StringIndexer(inputCols=inputs, outputCols=outputs)</span>
<span class="sd"> &gt;&gt;&gt; model = stringIndexer.fit(multiRowDf)</span>
<span class="sd"> &gt;&gt;&gt; result = model.transform(multiRowDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,</span>
<span class="sd"> ... result.index2).collect()]), key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0, 1.0), (1, 2.0, 0.0), (2, 1.0, 1.0), (3, 0.0, 0.0), (4, 0.0, 0.0), (5, 1.0, 0.0)]</span>
<span class="sd"> &gt;&gt;&gt; fromlabelsModel = StringIndexerModel.from_arrays_of_labels([[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;], [&quot;e&quot;, &quot;f&quot;]],</span>
<span class="sd"> ... inputCols=inputs, outputCols=outputs)</span>
<span class="sd"> &gt;&gt;&gt; result = fromlabelsModel.transform(multiRowDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,</span>
<span class="sd"> ... result.index2).collect()]), key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0, 0.0), (1, 1.0, 1.0), (2, 2.0, 0.0), (3, 0.0, 1.0), (4, 0.0, 1.0), (5, 2.0, 1.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, stringOrderType=&quot;frequencyDesc&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StringIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.StringIndexer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="StringIndexer.setParams"><!-- Highlighted source of StringIndexer.setParams; the [docs] anchor points back to its API reference entry and is labelled so it can be told apart from the other [docs] links. --><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setParams" aria-label="docs for StringIndexer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, stringOrderType=&quot;frequencyDesc&quot;)</span>
<span class="sd"> Sets params for this StringIndexer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">StringIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<div class="viewcode-block" id="StringIndexer.setStringOrderType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setStringOrderType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStringOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stringOrderType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stringOrderType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="StringIndexerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel">[docs]</a><span class="k">class</span> <span class="nc">StringIndexerModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span> <span class="n">_StringIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`StringIndexer`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="StringIndexerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.from_labels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.from_labels">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">from_labels</span><span class="p">(</span>
<span class="bp">cls</span><span class="p">,</span>
<span class="n">labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct the model directly from an array of label strings,</span>
<span class="sd"> requires an active SparkContext.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span>
<span class="n">jlabels</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">labels</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.StringIndexerModel&quot;</span><span class="p">,</span> <span class="n">jlabels</span>
<span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">setInputCol</span><span class="p">(</span><span class="n">inputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">outputCol</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setOutputCol</span><span class="p">(</span><span class="n">outputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">handleInvalid</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="n">handleInvalid</span><span class="p">)</span>
<span class="k">return</span> <span class="n">model</span></div>
<div class="viewcode-block" id="StringIndexerModel.from_arrays_of_labels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.from_arrays_of_labels">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">from_arrays_of_labels</span><span class="p">(</span>
<span class="bp">cls</span><span class="p">,</span>
<span class="n">arrayOfLabels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StringIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct the model directly from an array of arrays of label strings,</span>
<span class="sd"> requires an active SparkContext.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span>
<span class="n">jlabels</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">arrayOfLabels</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.StringIndexerModel&quot;</span><span class="p">,</span> <span class="n">jlabels</span>
<span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">setInputCols</span><span class="p">(</span><span class="n">inputCols</span><span class="p">)</span>
<span class="k">if</span> <span class="n">outputCols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setOutputCols</span><span class="p">(</span><span class="n">outputCols</span><span class="p">)</span>
<span class="k">if</span> <span class="n">handleInvalid</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="n">handleInvalid</span><span class="p">)</span>
<span class="k">return</span> <span class="n">model</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">labels</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Ordered list of labels, corresponding to indices to be assigned.</span>
<span class="sd"> .. deprecated:: 3.1.0</span>
<span class="sd"> It will be removed in future versions. Use the `labelsArray` property instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;labels&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.2&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">labelsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Array of ordered lists of labels, corresponding to indices to be assigned</span>
<span class="sd"> for each input column.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;labelsArray&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">IndexToString</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;IndexToString&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A :py:class:`pyspark.ml.base.Transformer` that maps a column of indices back to a new column of</span>
<span class="sd"> corresponding string values.</span>
<span class="sd"> The index-string mapping is either from the ML attributes of the input column,</span>
<span class="sd"> or from user-supplied labels (which take precedence over ML attributes).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> StringIndexer : for converting categorical values into category indices</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">labels</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;labels&quot;</span><span class="p">,</span>
<span class="s2">&quot;Optional array of labels specifying index-string mapping.&quot;</span>
<span class="o">+</span> <span class="s2">&quot; If not provided or if empty, then metadata from inputCol is used instead.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, labels=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">IndexToString</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.IndexToString&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="IndexToString.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IndexToString&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, labels=None)</span>
<span class="sd"> Sets params for this IndexToString.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.setLabels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setLabels">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLabels</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;IndexToString&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labels`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labels</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.getLabels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.getLabels">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLabels</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`labels` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">labels</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IndexToString&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;IndexToString&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="StopWordsRemover"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover">[docs]</a><span class="k">class</span> <span class="nc">StopWordsRemover</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that filters out stop words from input.</span>
<span class="sd"> Since 3.0.0, :py:class:`StopWordsRemover` can filter stop words out of multiple columns at once by setting</span>
<span class="sd"> the :py:attr:`inputCols` parameter. Note that when both the :py:attr:`inputCol` and</span>
<span class="sd"> :py:attr:`inputCols` parameters are set, an Exception will be thrown.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Null values in the input array are preserved unless null is explicitly added to stopWords.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],)], [&quot;text&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover = StopWordsRemover(stopWords=[&quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover.setInputCol(&quot;text&quot;)</span>
<span class="sd"> StopWordsRemover...</span>
<span class="sd"> &gt;&gt;&gt; remover.setOutputCol(&quot;words&quot;)</span>
<span class="sd"> StopWordsRemover...</span>
<span class="sd"> &gt;&gt;&gt; remover.transform(df).head().words == [&#39;a&#39;, &#39;c&#39;]</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; stopWordsRemoverPath = temp_path + &quot;/stopwords-remover&quot;</span>
<span class="sd"> &gt;&gt;&gt; remover.save(stopWordsRemoverPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover = StopWordsRemover.load(stopWordsRemoverPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover.getStopWords() == remover.getStopWords()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover.getCaseSensitive() == remover.getCaseSensitive()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover.transform(df).take(1) == remover.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;], [&quot;a&quot;, &quot;b&quot;])], [&quot;text1&quot;, &quot;text2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover2 = StopWordsRemover(stopWords=[&quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover2.setInputCols([&quot;text1&quot;, &quot;text2&quot;]).setOutputCols([&quot;words1&quot;, &quot;words2&quot;])</span>
<span class="sd"> StopWordsRemover...</span>
<span class="sd"> &gt;&gt;&gt; remover2.transform(df2).show()</span>
<span class="sd"> +---------+------+------+------+</span>
<span class="sd"> | text1| text2|words1|words2|</span>
<span class="sd"> +---------+------+------+------+</span>
<span class="sd"> |[a, b, c]|[a, b]|[a, c]| [a]|</span>
<span class="sd"> +---------+------+------+------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">stopWords</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;stopWords&quot;</span><span class="p">,</span>
<span class="s2">&quot;The words to be filtered out&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">caseSensitive</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;caseSensitive&quot;</span><span class="p">,</span>
<span class="s2">&quot;whether to do a case sensitive &quot;</span> <span class="o">+</span> <span class="s2">&quot;comparison over the stop words&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">locale</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;locale&quot;</span><span class="p">,</span>
<span class="s2">&quot;locale of the input. ignored when case sensitive &quot;</span> <span class="o">+</span> <span class="s2">&quot;is true&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">):</span>
<span class="o">...</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, \</span>
<span class="sd"> locale=None, inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StopWordsRemover</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.StopWordsRemover&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span>
<span class="n">stopWords</span><span class="o">=</span><span class="n">StopWordsRemover</span><span class="o">.</span><span class="n">loadDefaultStopWords</span><span class="p">(</span><span class="s2">&quot;english&quot;</span><span class="p">),</span>
<span class="n">caseSensitive</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">locale</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">getLocale</span><span class="p">(),</span>
<span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="StopWordsRemover.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, \</span>
<span class="sd"> locale=None, inputCols=None, outputCols=None)</span>
<span class="sd"> Sets params for this StopWordsRemover.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setStopWords">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStopWords</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stopWords`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stopWords</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.getStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getStopWords">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStopWords</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`stopWords` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stopWords</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setCaseSensitive"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setCaseSensitive">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setCaseSensitive</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`caseSensitive`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">caseSensitive</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.getCaseSensitive"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getCaseSensitive">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getCaseSensitive</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`caseSensitive` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">caseSensitive</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setLocale"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setLocale">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLocale</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`locale`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">locale</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.getLocale"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getLocale">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLocale</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`locale` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">locale</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;StopWordsRemover&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.loadDefaultStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.loadDefaultStopWords">[docs]</a> <span class="nd">@staticmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">loadDefaultStopWords</span><span class="p">(</span><span class="n">language</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Loads the default stop words for the given language.</span>
<span class="sd"> Supported languages: danish, dutch, english, finnish, french, german, hungarian,</span>
<span class="sd"> italian, norwegian, portuguese, russian, spanish, swedish, turkish</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stopWordsObj</span> <span class="o">=</span> <span class="n">_jvm</span><span class="p">()</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">feature</span><span class="o">.</span><span class="n">StopWordsRemover</span>
<span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="n">stopWordsObj</span><span class="o">.</span><span class="n">loadDefaultStopWords</span><span class="p">(</span><span class="n">language</span><span class="p">))</span></div></div>
<div class="viewcode-block" id="Tokenizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Tokenizer</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Tokenizer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A tokenizer that converts the input string to lowercase and then</span>
<span class="sd"> splits it by white spaces.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a b c&quot;,)], [&quot;text&quot;])</span>
<span class="sd"> &gt;&gt;&gt; tokenizer = Tokenizer(outputCol=&quot;words&quot;)</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.setInputCol(&quot;text&quot;)</span>
<span class="sd"> Tokenizer...</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Change a parameter.</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.setParams(outputCol=&quot;tokens&quot;).transform(df).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Temporarily modify a parameter.</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.transform(df, {tokenizer.outputCol: &quot;words&quot;}).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Must use keyword arguments to specify params.</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.setParams(&quot;text&quot;)</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> &gt;&gt;&gt; tokenizerPath = temp_path + &quot;/tokenizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.save(tokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedTokenizer = Tokenizer.load(tokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedTokenizer.transform(df).head().tokens == tokenizer.transform(df).head().tokens</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Tokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Tokenizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Tokenizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Tokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Tokenizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Tokenizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Tokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Tokenizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Tokenizer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="VectorAssembler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorAssembler</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;VectorAssembler&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that merges multiple columns into a vector column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 0, 3)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler = VectorAssembler(outputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.setInputCols([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> VectorAssembler...</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.transform(df).head().features</span>
<span class="sd"> DenseVector([1.0, 0.0, 3.0])</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.setParams(outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd"> DenseVector([1.0, 0.0, 3.0])</span>
<span class="sd"> &gt;&gt;&gt; params = {vecAssembler.inputCols: [&quot;b&quot;, &quot;a&quot;], vecAssembler.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.transform(df, params).head().vector</span>
<span class="sd"> DenseVector([0.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorAssemblerPath = temp_path + &quot;/vector-assembler&quot;</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.save(vectorAssemblerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedAssembler = VectorAssembler.load(vectorAssemblerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedAssembler.transform(df).head().freqs == vecAssembler.transform(df).head().freqs</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; dfWithNullsAndNaNs = spark.createDataFrame(</span>
<span class="sd"> ... [(1.0, 2.0, None), (3.0, float(&quot;nan&quot;), 4.0), (5.0, 6.0, 7.0)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler2 = VectorAssembler(inputCols=[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;], outputCol=&quot;features&quot;,</span>
<span class="sd"> ... handleInvalid=&quot;keep&quot;)</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler2.transform(dfWithNullsAndNaNs).show()</span>
<span class="sd"> +---+---+----+-------------+</span>
<span class="sd"> | a| b| c| features|</span>
<span class="sd"> +---+---+----+-------------+</span>
<span class="sd"> |1.0|2.0|NULL|[1.0,2.0,NaN]|</span>
<span class="sd"> |3.0|NaN| 4.0|[3.0,NaN,4.0]|</span>
<span class="sd"> |5.0|6.0| 7.0|[5.0,6.0,7.0]|</span>
<span class="sd"> +---+---+----+-------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler2.setParams(handleInvalid=&quot;skip&quot;).transform(dfWithNullsAndNaNs).show()</span>
<span class="sd"> +---+---+---+-------------+</span>
<span class="sd"> | a| b| c| features|</span>
<span class="sd"> +---+---+---+-------------+</span>
<span class="sd"> |5.0|6.0|7.0|[5.0,6.0,7.0]|</span>
<span class="sd"> +---+---+---+-------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to handle invalid data (NULL &quot;</span>
<span class="o">+</span> <span class="s2">&quot;and NaN values). Options are &#39;skip&#39; (filter out rows with invalid &quot;</span>
<span class="o">+</span> <span class="s2">&quot;data), &#39;error&#39; (throw an error), or &#39;keep&#39; (return relevant number &quot;</span>
<span class="o">+</span> <span class="s2">&quot;of NaN in the output). Column lengths are taken from the size of ML &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Attribute Group, which can be set using `VectorSizeHint` in a &quot;</span>
<span class="o">+</span> <span class="s2">&quot;pipeline before `VectorAssembler`. Column lengths can also be &quot;</span>
<span class="o">+</span> <span class="s2">&quot;inferred from first rows of the data since it is safe to do so but &quot;</span>
<span class="o">+</span> <span class="s2">&quot;only in case of &#39;error&#39; or &#39;skip&#39;).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCols=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorAssembler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorAssembler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VectorAssembler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorAssembler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCols=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for this VectorAssembler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorAssembler.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setInputCols">[docs]</a> <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorAssembler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorAssembler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorAssembler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorAssembler.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorAssembler&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_VectorIndexerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`VectorIndexer` and :py:class:`VectorIndexerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">maxCategories</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;maxCategories&quot;</span><span class="p">,</span>
<span class="s2">&quot;Threshold for the number of values a categorical feature can take &quot;</span>
<span class="o">+</span> <span class="s2">&quot;(&gt;= 2). If a feature is found to have &gt; maxCategories values, then &quot;</span>
<span class="o">+</span> <span class="s2">&quot;it is declared continuous.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to handle invalid data &quot;</span>
<span class="o">+</span> <span class="s2">&quot;(unseen labels or NULL values). Options are &#39;skip&#39; (filter out &quot;</span>
<span class="o">+</span> <span class="s2">&quot;rows with invalid data), &#39;error&#39; (throw an error), or &#39;keep&#39; (put &quot;</span>
<span class="o">+</span> <span class="s2">&quot;invalid data in a special additional bucket, at index of the number &quot;</span>
<span class="o">+</span> <span class="s2">&quot;of categories of the feature).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_VectorIndexerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of maxCategories or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxCategories</span><span class="p">)</span>
<div class="viewcode-block" id="VectorIndexer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorIndexer</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;VectorIndexerModel&quot;</span><span class="p">],</span>
<span class="n">_VectorIndexerParams</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;VectorIndexer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Class for indexing categorical feature columns in a dataset of `Vector`.</span>
<span class="sd"> This has 2 usage modes:</span>
<span class="sd"> - Automatically identify categorical features (default behavior)</span>
<span class="sd"> - This helps process a dataset of unknown vectors into a dataset with some continuous</span>
<span class="sd"> features and some categorical features. The choice between continuous and categorical</span>
<span class="sd"> is based upon a maxCategories parameter.</span>
<span class="sd"> - Set maxCategories to the maximum number of categories any categorical feature should</span>
<span class="sd"> have.</span>
<span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span>
<span class="sd"> If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},</span>
<span class="sd"> and feature 1 will be declared continuous.</span>
<span class="sd"> - Index all features, if all features are categorical</span>
<span class="sd"> - If maxCategories is set to be very large, then this will build an index of unique</span>
<span class="sd"> values for all features.</span>
<span class="sd"> - Warning: This can cause problems if features are continuous since this will collect ALL</span>
<span class="sd"> unique values to the driver.</span>
<span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span>
<span class="sd"> If maxCategories &gt;= 3, then both features will be declared categorical.</span>
<span class="sd"> This returns a model which can transform categorical features to use 0-based indices.</span>
<span class="sd"> Index stability:</span>
<span class="sd"> - This is not guaranteed to choose the same category index across multiple runs.</span>
<span class="sd"> - If a categorical feature includes value 0, then this is guaranteed to map value 0 to</span>
<span class="sd"> index 0. This maintains vector sparsity.</span>
<span class="sd"> - More stability may be added in the future.</span>
<span class="sd"> TODO: Future extensions: The following functionality is planned for the future:</span>
<span class="sd"> - Preserve metadata in transform; if a feature&#39;s metadata is already present,</span>
<span class="sd"> do not recompute.</span>
<span class="sd"> - Specify certain features to not index, either via a parameter or via existing metadata.</span>
<span class="sd"> - Add warning if a categorical feature has only 1 category.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; indexer = VectorIndexer(maxCategories=2, inputCol=&quot;a&quot;)</span>
<span class="sd"> &gt;&gt;&gt; indexer.setOutputCol(&quot;indexed&quot;)</span>
<span class="sd"> VectorIndexer...</span>
<span class="sd"> &gt;&gt;&gt; model = indexer.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; indexer.getHandleInvalid()</span>
<span class="sd"> &#39;error&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> VectorIndexerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().output</span>
<span class="sd"> DenseVector([1.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.numFeatures</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.categoryMaps</span>
<span class="sd"> {0: {0.0: 0, -1.0: 1}}</span>
<span class="sd"> &gt;&gt;&gt; indexer.setParams(outputCol=&quot;test&quot;).fit(df).transform(df).collect()[1].test</span>
<span class="sd"> DenseVector([0.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; params = {indexer.maxCategories: 3, indexer.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; model2 = indexer.fit(df, params)</span>
<span class="sd"> &gt;&gt;&gt; model2.transform(df).head().vector</span>
<span class="sd"> DenseVector([1.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorIndexerPath = temp_path + &quot;/vector-indexer&quot;</span>
<span class="sd"> &gt;&gt;&gt; indexer.save(vectorIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer = VectorIndexer.load(vectorIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer.getMaxCategories() == indexer.getMaxCategories()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/vector-indexer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = VectorIndexerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.numFeatures == model.numFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.categoryMaps == model.categoryMaps</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; dfWithInvalid = spark.createDataFrame([(Vectors.dense([3.0, 1.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; indexer.getHandleInvalid()</span>
<span class="sd"> &#39;error&#39;</span>
<span class="sd"> &gt;&gt;&gt; model3 = indexer.setHandleInvalid(&quot;skip&quot;).fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model3.transform(dfWithInvalid).count()</span>
<span class="sd"> 0</span>
<span class="sd"> &gt;&gt;&gt; model4 = indexer.setParams(handleInvalid=&quot;keep&quot;, outputCol=&quot;indexed&quot;).fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model4.transform(dfWithInvalid).head().indexed</span>
<span class="sd"> DenseVector([2.0, 1.0])</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">maxCategories</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, maxCategories=20, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorIndexer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VectorIndexer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">maxCategories</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, maxCategories=20, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for this VectorIndexer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexer.setMaxCategories"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setMaxCategories">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxCategories`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexerModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">VectorIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel">[docs]</a><span class="k">class</span> <span class="nc">VectorIndexerModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span> <span class="n">_VectorIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;VectorIndexerModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`VectorIndexer`.</span>
<span class="sd"> Transform categorical features to use 0-based indices instead of their original values.</span>
<span class="sd"> - Categorical features are mapped to indices.</span>
<span class="sd"> - Continuous features (columns) are left unchanged.</span>
<span class="sd"> This also appends metadata to the output column, marking features as Numeric (continuous),</span>
<span class="sd"> Nominal (categorical), or Binary (either continuous or categorical).</span>
<span class="sd"> Non-ML metadata is not carried over from the input to the output column.</span>
<span class="sd"> This maintains vector sparsity.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="VectorIndexerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorIndexerModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">numFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Number of features, i.e., length of Vectors which this transforms.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;numFeatures&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">categoryMaps</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Feature value index. Keys are categorical feature indices (column indices).</span>
<span class="sd"> Values are maps from original feature values to 0-based category indices.</span>
<span class="sd"> If a feature is not in this map, it is treated as continuous.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;javaCategoryMaps&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorSlicer</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;VectorSlicer&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This class takes a feature vector and outputs a new feature vector with a subarray</span>
<span class="sd"> of the original features.</span>
<span class="sd"> The subset of features can be specified with either indices (`setIndices()`)</span>
<span class="sd"> or names (`setNames()`). At least one feature must be selected. Duplicate features</span>
<span class="sd"> are not allowed, so there can be no overlap between selected indices and names.</span>
<span class="sd"> The output vector will order features with the selected indices first (in the order given),</span>
<span class="sd"> followed by the selected names (in the order given).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], [&quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; vs = VectorSlicer(outputCol=&quot;sliced&quot;, indices=[1, 4])</span>
<span class="sd"> &gt;&gt;&gt; vs.setInputCol(&quot;features&quot;)</span>
<span class="sd"> VectorSlicer...</span>
<span class="sd"> &gt;&gt;&gt; vs.transform(df).head().sliced</span>
<span class="sd"> DenseVector([2.3, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorSlicerPath = temp_path + &quot;/vector-slicer&quot;</span>
<span class="sd"> &gt;&gt;&gt; vs.save(vectorSlicerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedVs = VectorSlicer.load(vectorSlicerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedVs.getIndices() == vs.getIndices()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedVs.getNames() == vs.getNames()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedVs.transform(df).take(1) == vs.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">indices</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;indices&quot;</span><span class="p">,</span>
<span class="s2">&quot;An array of indices to select features from &quot;</span>
<span class="o">+</span> <span class="s2">&quot;a vector column. There can be no overlap with names.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;names&quot;</span><span class="p">,</span>
<span class="s2">&quot;An array of feature names to select features from &quot;</span>
<span class="o">+</span> <span class="s2">&quot;a vector column. These names must be specified by ML &quot;</span>
<span class="o">+</span> <span class="s2">&quot;org.apache.spark.ml.attribute.Attribute. There can be no overlap with &quot;</span>
<span class="o">+</span> <span class="s2">&quot;indices.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">indices</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, indices=None, names=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorSlicer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorSlicer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">indices</span><span class="o">=</span><span class="p">[],</span> <span class="n">names</span><span class="o">=</span><span class="p">[])</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VectorSlicer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">indices</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSlicer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, indices=None, names=None)</span>
<span class="sd"> Sets params for this VectorSlicer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setIndices"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setIndices">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setIndices</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSlicer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`indices`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">indices</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.getIndices"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.getIndices">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getIndices</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of indices or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">indices</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setNames"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setNames">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNames</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSlicer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`names`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">names</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.getNames"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.getNames">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNames</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of names or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">names</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSlicer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSlicer&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_Word2VecParams</span><span class="p">(</span><span class="n">HasStepSize</span><span class="p">,</span> <span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasSeed</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`Word2Vec` and :py:class:`Word2VecModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">vectorSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;vectorSize&quot;</span><span class="p">,</span>
<span class="s2">&quot;the dimension of codes after transforming from words&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">numPartitions</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;numPartitions&quot;</span><span class="p">,</span>
<span class="s2">&quot;number of partitions for sentences of words&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">minCount</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;minCount&quot;</span><span class="p">,</span>
<span class="s2">&quot;the minimum number of times a token must appear to be included in the &quot;</span>
<span class="o">+</span> <span class="s2">&quot;word2vec model&#39;s vocabulary&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">windowSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;windowSize&quot;</span><span class="p">,</span>
<span class="s2">&quot;the window size (context words from [-window, window]). Default value is 5&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">maxSentenceLength</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;maxSentenceLength&quot;</span><span class="p">,</span>
<span class="s2">&quot;Maximum length (in words) of each sentence in the input data. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Any sentence longer than this threshold will &quot;</span>
<span class="o">+</span> <span class="s2">&quot;be divided into chunks up to the size.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_Word2VecParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span>
<span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span>
<span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">windowSize</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="n">maxSentenceLength</span><span class="o">=</span><span class="mi">1000</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of vectorSize or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vectorSize</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numPartitions or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numPartitions</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minCount or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minCount</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWindowSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of windowSize or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">windowSize</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMaxSentenceLength</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of maxSentenceLength or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxSentenceLength</span><span class="p">)</span>
<div class="viewcode-block" id="Word2Vec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Word2Vec</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;Word2VecModel&quot;</span><span class="p">],</span>
<span class="n">_Word2VecParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Word2Vec&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further</span>
<span class="sd"> natural language processing or machine learning process.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; sent = (&quot;a b &quot; * 100 + &quot;a c &quot; * 10).split(&quot; &quot;)</span>
<span class="sd"> &gt;&gt;&gt; doc = spark.createDataFrame([(sent,), (sent,)], [&quot;sentence&quot;])</span>
<span class="sd"> &gt;&gt;&gt; word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol=&quot;sentence&quot;, outputCol=&quot;model&quot;)</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.setMaxIter(10)</span>
<span class="sd"> Word2Vec...</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.getMaxIter()</span>
<span class="sd"> 10</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.clear(word2Vec.maxIter)</span>
<span class="sd"> &gt;&gt;&gt; model = word2Vec.fit(doc)</span>
<span class="sd"> &gt;&gt;&gt; model.getMinCount()</span>
<span class="sd"> 5</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCol(&quot;sentence&quot;)</span>
<span class="sd"> Word2VecModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getVectors().show()</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> |word| vector|</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> | a|[0.0951...</span>
<span class="sd"> | b|[-1.202...</span>
<span class="sd"> | c|[0.3015...</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.findSynonymsArray(&quot;a&quot;, 2)</span>
<span class="sd"> [(&#39;b&#39;, 0.015859...), (&#39;c&#39;, -0.568079...)]</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import format_number as fmt</span>
<span class="sd"> &gt;&gt;&gt; model.findSynonyms(&quot;a&quot;, 2).select(&quot;word&quot;, fmt(&quot;similarity&quot;, 5).alias(&quot;similarity&quot;)).show()</span>
<span class="sd"> +----+----------+</span>
<span class="sd"> |word|similarity|</span>
<span class="sd"> +----+----------+</span>
<span class="sd"> | b| 0.01586|</span>
<span class="sd"> | c| -0.56808|</span>
<span class="sd"> +----+----------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(doc).head().model</span>
<span class="sd"> DenseVector([-0.4833, 0.1855, -0.273, -0.0509, -0.4769])</span>
<span class="sd"> &gt;&gt;&gt; word2vecPath = temp_path + &quot;/word2vec&quot;</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.save(word2vecPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec = Word2Vec.load(word2vecPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec.getMinCount() == word2Vec.getMinCount()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/word2vec-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = Word2VecModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.getVectors().first().word == model.getVectors().first().word</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.getVectors().first().vector == model.getVectors().first().vector</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(doc).take(1) == model.transform(doc).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">vectorSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span>
<span class="n">minCount</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span>
<span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">stepSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.025</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">windowSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span>
<span class="n">maxSentenceLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1000</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, \</span>
<span class="sd"> maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, \</span>
<span class="sd"> maxSentenceLength=1000)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Word2Vec</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Word2Vec&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Word2Vec.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">vectorSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span>
<span class="n">minCount</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span>
<span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">stepSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.025</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">windowSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span>
<span class="n">maxSentenceLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1000</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, \</span>
<span class="sd"> maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, \</span>
<span class="sd"> maxSentenceLength=1000)</span>
<span class="sd"> Sets params for this Word2Vec.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setVectorSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setVectorSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`vectorSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setNumPartitions"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setNumPartitions">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numPartitions`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numPartitions</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setMinCount"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMinCount">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minCount`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minCount</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setWindowSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setWindowSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWindowSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`windowSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">windowSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setMaxSentenceLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMaxSentenceLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxSentenceLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxSentenceLength`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxSentenceLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMaxIter">[docs]</a> <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxIter`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setStepSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setStepSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStepSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2Vec&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stepSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stepSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2VecModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Word2VecModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel">[docs]</a><span class="k">class</span> <span class="nc">Word2VecModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_Word2VecParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;Word2VecModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`Word2Vec`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="Word2VecModel.getVectors"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.getVectors">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVectors</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the vector representation of the words as a dataframe</span>
<span class="sd"> with two fields, word and vector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;getVectors&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2VecModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Word2VecModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.findSynonyms"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.findSynonyms">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">findSynonyms</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Vector</span><span class="p">],</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Find &quot;num&quot; number of words closest in similarity to &quot;word&quot;.</span>
<span class="sd"> word can be a string or vector representation.</span>
<span class="sd"> Returns a dataframe with two fields, word and similarity (where</span>
<span class="sd"> similarity is the cosine similarity).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">word</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;findSynonyms&quot;</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.findSynonymsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.findSynonymsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">findSynonymsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Vector</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">float</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Find &quot;num&quot; number of words closest in similarity to &quot;word&quot;.</span>
<span class="sd"> word can be a string or vector representation.</span>
<span class="sd"> Returns a list of (word, similarity) tuples, where similarity</span>
<span class="sd"> is the cosine similarity.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">word</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">tuples</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">findSynonymsArray</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">st</span><span class="p">:</span> <span class="p">(</span><span class="n">st</span><span class="o">.</span><span class="n">_1</span><span class="p">(),</span> <span class="n">st</span><span class="o">.</span><span class="n">_2</span><span class="p">()),</span> <span class="nb">list</span><span class="p">(</span><span class="n">tuples</span><span class="p">)))</span></div></div>
<span class="k">class</span> <span class="nc">_PCAParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`PCA` and :py:class:`PCAModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;k&quot;</span><span class="p">,</span>
<span class="s2">&quot;the number of principal components&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of k or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
<div class="viewcode-block" id="PCA"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PCA</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;PCAModel&quot;</span><span class="p">],</span> <span class="n">_PCAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;PCA&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> PCA trains a model to project vectors to a lower dimensional space of the</span>
<span class="sd"> top :py:attr:`k` principal components.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),</span>
<span class="sd"> ... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),</span>
<span class="sd"> ... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data,[&quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; pca = PCA(k=2, inputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; pca.setOutputCol(&quot;pca_features&quot;)</span>
<span class="sd"> PCA...</span>
<span class="sd"> &gt;&gt;&gt; model = pca.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getK()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> PCAModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).collect()[0].output</span>
<span class="sd"> DenseVector([1.648..., -4.013...])</span>
<span class="sd"> &gt;&gt;&gt; model.explainedVariance</span>
<span class="sd"> DenseVector([0.794..., 0.205...])</span>
<span class="sd"> &gt;&gt;&gt; pcaPath = temp_path + &quot;/pca&quot;</span>
<span class="sd"> &gt;&gt;&gt; pca.save(pcaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPca = PCA.load(pcaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPca.getK() == pca.getK()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/pca-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = PCAModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.pc == model.pc</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.explainedVariance == model.explainedVariance</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, k=None, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PCA</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.PCA&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="PCA.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PCA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, k=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Set params for this PCA.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCA.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PCA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCA.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PCA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCA.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PCA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PCAModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">PCAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel">[docs]</a><span class="k">class</span> <span class="nc">PCAModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_PCAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;PCAModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="PCAModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PCAModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCAModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PCAModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DenseMatrix</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a principal components Matrix.</span>
<span class="sd"> Each column is one principal component.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;pc&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">explainedVariance</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DenseVector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a vector of proportions of variance</span>
<span class="sd"> explained by each principal component.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;explainedVariance&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_RFormulaParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`RFormula` and :py:class:`RFormulaModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">formula</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;formula&quot;</span><span class="p">,</span> <span class="s2">&quot;R model formula&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span>
<span class="p">)</span>
<span class="n">forceIndexLabel</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;forceIndexLabel&quot;</span><span class="p">,</span>
<span class="s2">&quot;Force to index label whether it is numeric or string&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">stringIndexerOrderType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;stringIndexerOrderType&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to order categories of a string feature column used by &quot;</span>
<span class="o">+</span> <span class="s2">&quot;StringIndexer. The last category after ordering is dropped &quot;</span>
<span class="o">+</span> <span class="s2">&quot;when encoding strings. Supported options: frequencyDesc, &quot;</span>
<span class="o">+</span> <span class="s2">&quot;frequencyAsc, alphabetDesc, alphabetAsc. The default value &quot;</span>
<span class="o">+</span> <span class="s2">&quot;is frequencyDesc. When the ordering is set to alphabetDesc, &quot;</span>
<span class="o">+</span> <span class="s2">&quot;RFormula drops the same category as R when encoding strings.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;how to handle invalid entries. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Options are &#39;skip&#39; (filter out rows with invalid values), &quot;</span>
<span class="o">+</span> <span class="s2">&quot;&#39;error&#39; (throw an error), or &#39;keep&#39; (put invalid data in a special &quot;</span>
<span class="o">+</span> <span class="s2">&quot;additional bucket, at index numLabels).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_RFormulaParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span>
<span class="n">forceIndexLabel</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`formula`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getForceIndexLabel</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`forceIndexLabel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">forceIndexLabel</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStringIndexerOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`stringIndexerOrderType` or its default value &#39;frequencyDesc&#39;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stringIndexerOrderType</span><span class="p">)</span>
<div class="viewcode-block" id="RFormula"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">RFormula</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;RFormulaModel&quot;</span><span class="p">],</span>
<span class="n">_RFormulaParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;RFormula&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Implements the transforms required for fitting a dataset against an</span>
<span class="sd"> R model formula. Currently we support a limited subset of the R</span>
<span class="sd"> operators, including &#39;~&#39;, &#39;.&#39;, &#39;:&#39;, &#39;+&#39;, &#39;-&#39;, &#39;*&#39;, and &#39;^&#39;.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Also see the `R formula docs</span>
<span class="sd"> &lt;https://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html&gt;`_.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (1.0, 1.0, &quot;a&quot;),</span>
<span class="sd"> ... (0.0, 2.0, &quot;b&quot;),</span>
<span class="sd"> ... (0.0, 0.0, &quot;a&quot;)</span>
<span class="sd"> ... ], [&quot;y&quot;, &quot;x&quot;, &quot;s&quot;])</span>
<span class="sd"> &gt;&gt;&gt; rf = RFormula(formula=&quot;y ~ x + s&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = rf.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getLabelCol()</span>
<span class="sd"> &#39;label&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> | y| x| s| features|label|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> |1.0|1.0| a|[1.0,1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b|[2.0,0.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a|[0.0,1.0]| 0.0|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; rf.fit(df, {rf.formula: &quot;y ~ . - s&quot;}).transform(df).show()</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> | y| x| s|features|label|</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> |1.0|1.0| a| [1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b| [2.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a| [0.0]| 0.0|</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; rFormulaPath = temp_path + &quot;/rFormula&quot;</span>
<span class="sd"> &gt;&gt;&gt; rf.save(rFormulaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRF = RFormula.load(rFormulaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getFormula() == rf.getFormula()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getFeaturesCol() == rf.getFeaturesCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getLabelCol() == rf.getLabelCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getHandleInvalid() == rf.getHandleInvalid()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; str(loadedRF)</span>
<span class="sd"> &#39;RFormula(y ~ x + s) (uid=...)&#39;</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/rFormulaModel&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = RFormulaModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.uid == model.uid</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).show()</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> | y| x| s| features|label|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> |1.0|1.0| a|[1.0,1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b|[2.0,0.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a|[0.0,1.0]| 0.0|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; str(loadedModel)</span>
<span class="sd"> &#39;RFormulaModel(ResolvedRFormula(label=y, terms=[x,s], hasIntercept=true)) (uid=...)&#39;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">formula</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">forceIndexLabel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">stringIndexerOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, formula=None, featuresCol=&quot;features&quot;, labelCol=&quot;label&quot;, \</span>
<span class="sd"> forceIndexLabel=False, stringIndexerOrderType=&quot;frequencyDesc&quot;, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RFormula</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.RFormula&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="RFormula.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">formula</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">forceIndexLabel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">stringIndexerOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormula&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, formula=None, featuresCol=&quot;features&quot;, labelCol=&quot;label&quot;, \</span>
<span class="sd"> forceIndexLabel=False, stringIndexerOrderType=&quot;frequencyDesc&quot;, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for RFormula.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setFormula"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setFormula">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormula&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`formula`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">formula</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setForceIndexLabel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setForceIndexLabel">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setForceIndexLabel</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormula&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`forceIndexLabel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">forceIndexLabel</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setStringIndexerOrderType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setStringIndexerOrderType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStringIndexerOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormula&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stringIndexerOrderType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setFeaturesCol">[docs]</a> <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormula&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setLabelCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setLabelCol">[docs]</a> <span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormula&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormula&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RFormulaModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">RFormulaModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="n">formulaStr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getFormula</span><span class="p">()</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">isDefined</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">)</span> <span class="k">else</span> <span class="s2">&quot;&quot;</span>
<span class="k">return</span> <span class="s2">&quot;RFormula(</span><span class="si">%s</span><span class="s2">) (uid=</span><span class="si">%s</span><span class="s2">)&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">formulaStr</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormulaModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormulaModel.html#pyspark.ml.feature.RFormulaModel">[docs]</a><span class="k">class</span> <span class="nc">RFormulaModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_RFormulaParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;RFormulaModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`RFormula`. Fitting is required to determine the</span>
<span class="sd"> factor levels of formula terms.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="n">resolvedFormula</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;resolvedFormula&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="s2">&quot;RFormulaModel(</span><span class="si">%s</span><span class="s2">) (uid=</span><span class="si">%s</span><span class="s2">)&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">resolvedFormula</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_SelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`Selector` and :py:class:`SelectorModel`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">selectorType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;selectorType&quot;</span><span class="p">,</span>
<span class="s2">&quot;The selector type. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Supported options: numTopFeatures (default), percentile, fpr, fdr, fwe.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">numTopFeatures</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span>
<span class="s2">&quot;Number of features that selector will select, ordered by ascending p-value. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;If the number of features is &lt; numTopFeatures, then this will select &quot;</span>
<span class="o">+</span> <span class="s2">&quot;all features.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">percentile</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;percentile&quot;</span><span class="p">,</span>
<span class="s2">&quot;Percentile of features that selector &quot;</span> <span class="o">+</span> <span class="s2">&quot;will select, ordered by ascending p-value.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">fpr</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;fpr&quot;</span><span class="p">,</span>
<span class="s2">&quot;The highest p-value for features to be kept.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">fdr</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;fdr&quot;</span><span class="p">,</span>
<span class="s2">&quot;The upper bound of the expected false discovery rate.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">fwe</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;fwe&quot;</span><span class="p">,</span>
<span class="s2">&quot;The upper bound of the expected family-wise error rate.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<!-- Listing of the _SelectorParams initializer: it forwards *args to the parent initializer via
     super(_SelectorParams, self), then calls _setDefault with numTopFeatures=50,
     selectorType="numTopFeatures", percentile=0.1 and fpr, fdr, fwe all 0.05. --><span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_SelectorParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span>
<span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span>
<span class="n">selectorType</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span>
<span class="n">percentile</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span>
<span class="n">fpr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span>
<span class="n">fdr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span>
<span class="n">fwe</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span>
<span class="p">)</span>
<!-- Listing of six getters: getSelectorType, getNumTopFeatures, getPercentile, getFpr, getFdr, getFwe.
     Each one returns self.getOrDefault(...) for the param of the same name.
     Their @since decorators read 2.1.0, 2.0.0, 2.1.0, 2.1.0, 2.2.0 and 2.2.0 respectively. --><span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSelectorType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of selectorType or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectorType</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumTopFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numTopFeatures or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numTopFeatures</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getPercentile</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of percentile or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">percentile</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFpr</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of fpr or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fpr</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFdr</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of fdr or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fdr</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFwe</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of fwe or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fwe</span><span class="p">)</span>
<!-- Listing of class _Selector, whose bases are JavaEstimator[JM], _SelectorParams, JavaMLReadable,
     JavaMLWritable and Generic[JM]. It holds nine setters; each returns self._set(name=value).
     The first six carry @since decorators; setFeaturesCol, setOutputCol and setLabelCol have none.
     This listing has no viewcode-block wrapper or [docs] back-link around it. --><span class="k">class</span> <span class="nc">_Selector</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">JM</span><span class="p">],</span> <span class="n">_SelectorParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">,</span> <span class="n">Generic</span><span class="p">[</span><span class="n">JM</span><span class="p">]):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Selectors.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSelectorType</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`selectorType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectorType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumTopFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numTopFeatures`.</span>
<span class="sd"> Only applicable when selectorType = &quot;numTopFeatures&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numTopFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPercentile</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`percentile`.</span>
<span class="sd"> Only applicable when selectorType = &quot;percentile&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">percentile</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFpr</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`fpr`.</span>
<span class="sd"> Only applicable when selectorType = &quot;fpr&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fpr</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFdr</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`fdr`.</span>
<span class="sd"> Only applicable when selectorType = &quot;fdr&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fdr</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFwe</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`fwe`.</span>
<span class="sd"> Only applicable when selectorType = &quot;fwe&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fwe</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<!-- Listing of class _SelectorModel (bases JavaModel and _SelectorParams).
     setFeaturesCol and setOutputCol, both @since 3.0.0, return self._set(...).
     selectedFeatures is a property (@since 2.0.0) that returns self._call_java("selectedFeatures").
     This listing has no viewcode-block wrapper or [docs] back-link around it. --><span class="k">class</span> <span class="nc">_SelectorModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_SelectorParams</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Selector models.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">P</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> List of indices to select (filter).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;selectedFeatures&quot;</span><span class="p">)</span>
<!-- viewcode block for ChiSqSelector. The id is the link target for this listing; the [docs] anchor
     points back to reference/api/pyspark.ml.feature.ChiSqSelector.html.
     Contents: class docstring (marks the class deprecated in 3.1.0 in favour of UnivariateFeatureSelector),
     keyword-only initializer, a nested viewcode block for setParams, and _create_model.
     The outer div is closed at the end of the _create_model listing. --><div class="viewcode-block" id="ChiSqSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelector.html#pyspark.ml.feature.ChiSqSelector">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">ChiSqSelector</span><span class="p">(</span>
<span class="n">_Selector</span><span class="p">[</span><span class="s2">&quot;ChiSqSelectorModel&quot;</span><span class="p">],</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;ChiSqSelector&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Chi-Squared feature selection, which selects categorical features to use for predicting a</span>
<span class="sd"> categorical label.</span>
<span class="sd"> The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`,</span>
<span class="sd"> `fdr`, `fwe`.</span>
<span class="sd"> * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.</span>
<span class="sd"> * `percentile` is similar but chooses a fraction of all features</span>
<span class="sd"> instead of a fixed number.</span>
<span class="sd"> * `fpr` chooses all features whose p-values are below a threshold,</span>
<span class="sd"> thus controlling the false positive rate of selection.</span>
<span class="sd"> * `fdr` uses the `Benjamini-Hochberg procedure &lt;https://en.wikipedia.org/wiki/</span>
<span class="sd"> False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure&gt;`_</span>
<span class="sd"> to choose all features whose false discovery rate is below a threshold.</span>
<span class="sd"> * `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by</span>
<span class="sd"> 1/numFeatures, thus controlling the family-wise error rate of selection.</span>
<span class="sd"> By default, the selection method is `numTopFeatures`, with the default number of top features</span>
<span class="sd"> set to 50.</span>
<span class="sd"> .. deprecated:: 3.1.0</span>
<span class="sd"> Use UnivariateFeatureSelector</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),</span>
<span class="sd"> ... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),</span>
<span class="sd"> ... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],</span>
<span class="sd"> ... [&quot;features&quot;, &quot;label&quot;])</span>
<span class="sd"> &gt;&gt;&gt; selector = ChiSqSelector(numTopFeatures=1, outputCol=&quot;selectedFeatures&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = selector.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getFeaturesCol()</span>
<span class="sd"> &#39;features&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setFeaturesCol(&quot;features&quot;)</span>
<span class="sd"> ChiSqSelectorModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().selectedFeatures</span>
<span class="sd"> DenseVector([18.0])</span>
<span class="sd"> &gt;&gt;&gt; model.selectedFeatures</span>
<span class="sd"> [2]</span>
<span class="sd"> &gt;&gt;&gt; chiSqSelectorPath = temp_path + &quot;/chi-sq-selector&quot;</span>
<span class="sd"> &gt;&gt;&gt; selector.save(chiSqSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector = ChiSqSelector.load(chiSqSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector.getNumTopFeatures() == selector.getNumTopFeatures()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/chi-sq-selector-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = ChiSqSelectorModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.selectedFeatures == model.selectedFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numTopFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">selectorType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span>
<span class="n">percentile</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.1</span><span class="p">,</span>
<span class="n">fpr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="n">fdr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="n">fwe</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numTopFeatures=50, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;label&quot;, selectorType=&quot;numTopFeatures&quot;, percentile=0.1, fpr=0.05, \</span>
<span class="sd"> fdr=0.05, fwe=0.05)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">ChiSqSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.ChiSqSelector&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<!-- Nested viewcode block for ChiSqSelector.setParams; the listing passes self._input_kwargs to self._set and returns the result. --><div class="viewcode-block" id="ChiSqSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelector.html#pyspark.ml.feature.ChiSqSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">numTopFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">selectorType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span>
<span class="n">percentile</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.1</span><span class="p">,</span>
<span class="n">fpr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="n">fdr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="n">fwe</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ChiSqSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numTopFeatures=50, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;label&quot;, selectorType=&quot;numTopFeatures&quot;, percentile=0.1, fpr=0.05, \</span>
<span class="sd"> fdr=0.05, fwe=0.05)</span>
<span class="sd"> Sets params for this ChiSqSelector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ChiSqSelectorModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ChiSqSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- viewcode block for ChiSqSelectorModel; the [docs] anchor points back to
     reference/api/pyspark.ml.feature.ChiSqSelectorModel.html. The listed class body is only a docstring;
     its bases are _SelectorModel, JavaMLReadable["ChiSqSelectorModel"] and JavaMLWritable. --><div class="viewcode-block" id="ChiSqSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelectorModel.html#pyspark.ml.feature.ChiSqSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">ChiSqSelectorModel</span><span class="p">(</span><span class="n">_SelectorModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;ChiSqSelectorModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`ChiSqSelector`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="VectorSizeHint"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorSizeHint</span><span class="p">(</span>
<span class="n">JavaTransformer</span><span class="p">,</span>
<span class="n">HasInputCol</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;VectorSizeHint&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that adds size information to the metadata of a vector column.</span>
<span class="sd"> VectorAssembler needs size information for its input columns and cannot be used on streaming</span>
<span class="sd"> dataframes without this metadata.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> VectorSizeHint modifies `inputCol` to include size metadata and does not have an outputCol.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml import Pipeline, PipelineModel</span>
<span class="sd"> &gt;&gt;&gt; data = [(Vectors.dense([1., 2., 3.]), 4.)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;vector&quot;, &quot;float&quot;])</span>
<span class="sd"> &gt;&gt;&gt;</span>
<span class="sd"> &gt;&gt;&gt; sizeHint = VectorSizeHint(inputCol=&quot;vector&quot;, size=3, handleInvalid=&quot;skip&quot;)</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler = VectorAssembler(inputCols=[&quot;vector&quot;, &quot;float&quot;], outputCol=&quot;assembled&quot;)</span>
<span class="sd"> &gt;&gt;&gt; pipeline = Pipeline(stages=[sizeHint, vecAssembler])</span>
<span class="sd"> &gt;&gt;&gt;</span>
<span class="sd"> &gt;&gt;&gt; pipelineModel = pipeline.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; pipelineModel.transform(df).head().assembled</span>
<span class="sd"> DenseVector([1.0, 2.0, 3.0, 4.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorSizeHintPath = temp_path + &quot;/vector-size-hint-pipeline&quot;</span>
<span class="sd"> &gt;&gt;&gt; pipelineModel.save(vectorSizeHintPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPipeline = PipelineModel.load(vectorSizeHintPath)</span>
<span class="sd"> &gt;&gt;&gt; loaded = loadedPipeline.transform(df).head().assembled</span>
<span class="sd"> &gt;&gt;&gt; expected = pipelineModel.transform(df).head().assembled</span>
<span class="sd"> &gt;&gt;&gt; loaded == expected</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="n">size</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;size&quot;</span><span class="p">,</span> <span class="s2">&quot;Size of vectors in column.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span>
<span class="p">)</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to handle invalid vectors in inputCol. Invalid vectors include &quot;</span>
<span class="s2">&quot;nulls and vectors with the wrong size. The options are `skip` (filter &quot;</span>
<span class="s2">&quot;out rows with invalid vectors), `error` (throw an error) and &quot;</span>
<span class="s2">&quot;`optimistic` (do not check the vector size, and keep all rows). &quot;</span>
<span class="s2">&quot;`error` by default.&quot;</span><span class="p">,</span>
<span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, size=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorSizeHint</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorSizeHint&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VectorSizeHint.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSizeHint&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, size=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for this VectorSizeHint.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.getSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.getSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Gets size param, the size of vectors in `inputCol`.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">size</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.setSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSizeHint&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Sets size param, the size of vectors in `inputCol`.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSizeHint&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VectorSizeHint&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_VarianceThresholdSelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`VarianceThresholdSelector` and</span>
<span class="sd"> :py:class:`VarianceThresholdSelectorModel`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">varianceThreshold</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;varianceThreshold&quot;</span><span class="p">,</span>
<span class="s2">&quot;Param for variance threshold. Features with a variance not &quot;</span>
<span class="o">+</span> <span class="s2">&quot;greater than this threshold will be removed. The default value &quot;</span>
<span class="o">+</span> <span class="s2">&quot;is 0.0.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVarianceThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of varianceThreshold or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">varianceThreshold</span><span class="p">)</span>
<div class="viewcode-block" id="VarianceThresholdSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VarianceThresholdSelector</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;VarianceThresholdSelectorModel&quot;</span><span class="p">],</span>
<span class="n">_VarianceThresholdSelectorParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;VarianceThresholdSelector&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Feature selector that removes all low-variance features. Features with a</span>
<span class="sd"> (sample) variance not greater than the threshold will be removed. The default is to keep</span>
<span class="sd"> all features with non-zero variance, i.e. remove the features that have the</span>
<span class="sd"> same value in all samples.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(Vectors.dense([6.0, 7.0, 0.0, 7.0, 6.0, 0.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 9.0, 6.0, 0.0, 5.0, 9.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 9.0, 3.0, 0.0, 5.0, 5.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 9.0, 8.0, 5.0, 6.0, 4.0]),),</span>
<span class="sd"> ... (Vectors.dense([8.0, 9.0, 6.0, 5.0, 4.0, 4.0]),),</span>
<span class="sd"> ... (Vectors.dense([8.0, 9.0, 6.0, 0.0, 0.0, 0.0]),)],</span>
<span class="sd"> ... [&quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; selector = VarianceThresholdSelector(varianceThreshold=8.2, outputCol=&quot;selectedFeatures&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = selector.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getFeaturesCol()</span>
<span class="sd"> &#39;features&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setFeaturesCol(&quot;features&quot;)</span>
<span class="sd"> VarianceThresholdSelectorModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().selectedFeatures</span>
<span class="sd"> DenseVector([6.0, 7.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.selectedFeatures</span>
<span class="sd"> [0, 3, 5]</span>
<span class="sd"> &gt;&gt;&gt; varianceThresholdSelectorPath = temp_path + &quot;/variance-threshold-selector&quot;</span>
<span class="sd"> &gt;&gt;&gt; selector.save(varianceThresholdSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector = VarianceThresholdSelector.load(varianceThresholdSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector.getVarianceThreshold() == selector.getVarianceThreshold()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/variance-threshold-selector-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = VarianceThresholdSelectorModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.selectedFeatures == model.selectedFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">varianceThreshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, varianceThreshold=0.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VarianceThresholdSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.VarianceThresholdSelector&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">varianceThreshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VarianceThresholdSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">varianceThreshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VarianceThresholdSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, varianceThreshold=0.0)</span>
<span class="sd"> Sets params for this VarianceThresholdSelector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelector.setVarianceThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setVarianceThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setVarianceThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VarianceThresholdSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`varianceThreshold`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">varianceThreshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelector.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VarianceThresholdSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelector.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VarianceThresholdSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VarianceThresholdSelectorModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">VarianceThresholdSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">VarianceThresholdSelectorModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span>
<span class="n">_VarianceThresholdSelectorParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;VarianceThresholdSelectorModel&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`VarianceThresholdSelector`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="VarianceThresholdSelectorModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VarianceThresholdSelectorModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelectorModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;VarianceThresholdSelectorModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> List of indices to select (filter).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;selectedFeatures&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_UnivariateFeatureSelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`UnivariateFeatureSelector` and</span>
<span class="sd"> :py:class:`UnivariateFeatureSelectorModel`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">featureType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;featureType&quot;</span><span class="p">,</span>
<span class="s2">&quot;The feature type. &quot;</span> <span class="o">+</span> <span class="s2">&quot;Supported options: categorical, continuous.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">labelType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;labelType&quot;</span><span class="p">,</span>
<span class="s2">&quot;The label type. &quot;</span> <span class="o">+</span> <span class="s2">&quot;Supported options: categorical, continuous.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">selectionMode</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;selectionMode&quot;</span><span class="p">,</span>
<span class="s2">&quot;The selection mode. &quot;</span>
<span class="o">+</span> <span class="s2">&quot;Supported options: numTopFeatures (default), percentile, fpr, &quot;</span>
<span class="o">+</span> <span class="s2">&quot;fdr, fwe.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">selectionThreshold</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;selectionThreshold&quot;</span><span class="p">,</span>
<span class="s2">&quot;The upper bound of the &quot;</span> <span class="o">+</span> <span class="s2">&quot;features that selector will select.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">selectionMode</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFeatureType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of featureType or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">featureType</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLabelType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of labelType or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">labelType</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSelectionMode</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of selectionMode or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectionMode</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSelectionThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of selectionThreshold or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectionThreshold</span><span class="p">)</span>
<div class="viewcode-block" id="UnivariateFeatureSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">UnivariateFeatureSelector</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">&quot;UnivariateFeatureSelectorModel&quot;</span><span class="p">],</span>
<span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> UnivariateFeatureSelector</span>
<span class="sd"> Feature selector based on univariate statistical tests against labels. Currently, Spark</span>
<span class="sd"> supports three Univariate Feature Selectors: chi-squared, ANOVA F-test and F-value.</span>
<span class="sd"> Users can choose a Univariate Feature Selector by setting `featureType` and `labelType`,</span>
<span class="sd"> and Spark will pick the score function based on the specified `featureType` and `labelType`.</span>
<span class="sd"> The following combinations of `featureType` and `labelType` are supported:</span>
<span class="sd"> - `featureType` `categorical` and `labelType` `categorical`, Spark uses chi-squared,</span>
<span class="sd"> i.e. chi2 in sklearn.</span>
<span class="sd"> - `featureType` `continuous` and `labelType` `categorical`, Spark uses ANOVA F-test,</span>
<span class="sd"> i.e. f_classif in sklearn.</span>
<span class="sd"> - `featureType` `continuous` and `labelType` `continuous`, Spark uses F-value,</span>
<span class="sd"> i.e. f_regression in sklearn.</span>
<span class="sd"> The `UnivariateFeatureSelector` supports different selection modes: `numTopFeatures`,</span>
<span class="sd"> `percentile`, `fpr`, `fdr`, `fwe`.</span>
<span class="sd"> - `numTopFeatures` chooses a fixed number of top features according to a</span>
<span class="sd"> hypothesis.</span>
<span class="sd"> - `percentile` is similar but chooses a fraction of all features</span>
<span class="sd"> instead of a fixed number.</span>
<span class="sd"> - `fpr` chooses all features whose p-values are below a threshold,</span>
<span class="sd"> thus controlling the false positive rate of selection.</span>
<span class="sd"> - `fdr` uses the `Benjamini-Hochberg procedure \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure&gt;`_</span>
<span class="sd"> to choose all features whose false discovery rate is below a threshold.</span>
<span class="sd"> - `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by</span>
<span class="sd"> 1 / `numFeatures`, thus controlling the family-wise error rate of selection.</span>
<span class="sd"> By default, the selection mode is `numTopFeatures`.</span>
<span class="sd"> .. versionadded:: 3.1.1</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(Vectors.dense([1.7, 4.4, 7.6, 5.8, 9.6, 2.3]), 3.0),</span>
<span class="sd"> ... (Vectors.dense([8.8, 7.3, 5.7, 7.3, 2.2, 4.1]), 2.0),</span>
<span class="sd"> ... (Vectors.dense([1.2, 9.5, 2.5, 3.1, 8.7, 2.5]), 1.0),</span>
<span class="sd"> ... (Vectors.dense([3.7, 9.2, 6.1, 4.1, 7.5, 3.8]), 2.0),</span>
<span class="sd"> ... (Vectors.dense([8.9, 5.2, 7.8, 8.3, 5.2, 3.0]), 4.0),</span>
<span class="sd"> ... (Vectors.dense([7.9, 8.5, 9.2, 4.0, 9.4, 2.1]), 4.0)],</span>
<span class="sd"> ... [&quot;features&quot;, &quot;label&quot;])</span>
<span class="sd"> &gt;&gt;&gt; selector = UnivariateFeatureSelector(outputCol=&quot;selectedFeatures&quot;)</span>
<span class="sd"> &gt;&gt;&gt; selector.setFeatureType(&quot;continuous&quot;).setLabelType(&quot;categorical&quot;).setSelectionThreshold(1)</span>
<span class="sd"> UnivariateFeatureSelector...</span>
<span class="sd"> &gt;&gt;&gt; model = selector.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getFeaturesCol()</span>
<span class="sd"> &#39;features&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setFeaturesCol(&quot;features&quot;)</span>
<span class="sd"> UnivariateFeatureSelectorModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().selectedFeatures</span>
<span class="sd"> DenseVector([7.6])</span>
<span class="sd"> &gt;&gt;&gt; model.selectedFeatures</span>
<span class="sd"> [2]</span>
<span class="sd"> &gt;&gt;&gt; selectorPath = temp_path + &quot;/selector&quot;</span>
<span class="sd"> &gt;&gt;&gt; selector.save(selectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector = UnivariateFeatureSelector.load(selectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector.getSelectionThreshold() == selector.getSelectionThreshold()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/selector-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = UnivariateFeatureSelectorModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.selectedFeatures == model.selectedFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">selectionMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;label&quot;, selectionMode=&quot;numTopFeatures&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">UnivariateFeatureSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.UnivariateFeatureSelector&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="UnivariateFeatureSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">selectionMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;label&quot;, selectionMode=&quot;numTopFeatures&quot;)</span>
<span class="sd"> Sets params for this UnivariateFeatureSelector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setFeatureType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setFeatureType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeatureType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featureType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featureType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setLabelType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setLabelType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLabelType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setSelectionMode"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setSelectionMode">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSelectionMode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`selectionMode`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectionMode</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setSelectionThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setSelectionThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSelectionThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`selectionThreshold`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectionThreshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setFeaturesCol">[docs]</a> <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setLabelCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setLabelCol">[docs]</a> <span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelector&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelectorModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">UnivariateFeatureSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">UnivariateFeatureSelectorModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span>
<span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;UnivariateFeatureSelectorModel&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`UnivariateFeatureSelector`.</span>
<span class="sd"> .. versionadded:: 3.1.1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="UnivariateFeatureSelectorModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelectorModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelectorModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UnivariateFeatureSelectorModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> List of indices to select (filter).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;selectedFeatures&quot;</span><span class="p">)</span></div>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="kn">import</span> <span class="nn">pyspark.ml.feature</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span><span class="p">,</span> <span class="n">SparkSession</span>
<span class="n">globs</span> <span class="o">=</span> <span class="nb">globals</span><span class="p">()</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">features</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">feature</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">features</span><span class="p">)</span>
<span class="c1"># The small batch size here ensures that we see multiple batches,</span>
<span class="c1"># even in these small test examples:</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[2]&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;ml.feature tests&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;sc&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;spark&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span>
<span class="n">testData</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;b&quot;</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;c&quot;</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;c&quot;</span><span class="p">),</span>
<span class="p">],</span>
<span class="mi">2</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;stringIndDf&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">testData</span><span class="p">)</span>
<span class="n">temp_path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;temp_path&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">temp_path</span>
<span class="k">try</span><span class="p">:</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">finally</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">shutil</span> <span class="kn">import</span> <span class="n">rmtree</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">rmtree</span><span class="p">(</span><span class="n">temp_path</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>