<!-- blob: 1b3f0eff3cee2579c856c8ad40a3566cea6a8cfd [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>pyspark.ml.clustering &#8212; PySpark 4.0.0-preview1 documentation</title>
<script data-cfasync="false">
// Copy the stored "mode" and "theme" values from localStorage onto <html> as
// data-mode / data-theme. This inline script sits ahead of the stylesheet links.
// Fallbacks when nothing is stored: mode "" and theme "light".
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/ml/clustering';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/clustering.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en">
<!-- Matomo -->
<script type="text/javascript">
// Queue of tracker commands; an already-defined window._paq is reused rather than
// replaced (presumably the queue is consumed by matomo.js once it loads).
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
// Base URL of the analytics host; both the tracker endpoint and the script URL are built from it.
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// Create an async <script> for matomo.js and insert it before the first <script> in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../../../index.html">
<img src="../../../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/>
<script>document.write(`<img src="../../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
// The search button is written by script, so it only appears in the page when
// JavaScript actually runs.
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
function buildURL(entry) {
  // Homepage URL pattern for one docs version; "{version}" is the only placeholder.
  // entry: one element of versions.json; only entry.version is read here.
  // Returns the pattern with the first "{version}" replaced by entry.version.
  const homepagePattern = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return homepagePattern.replace("{version}", entry.version);
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
function checkPageExistsAndRedirect(event) {
  // Click handler for a version-switcher link.
  // event: click event whose target is an <a>; its href is the homepage of
  //        another docs version.
  // Sends a HEAD request for the current page's path under that version and
  // navigates there on success, or to the version's homepage on failure.
  // Returns false so the browser does not follow the link before the probe
  // has finished.
  const currentFilePath = "_modules/pyspark/ml/clustering.html",
        otherDocsHomepage = event.target.getAttribute("href"),
        // The homepage href ends in "index.html"; strip it so the page path is
        // appended to the docs root rather than to ".../index.html".
        otherDocsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  const tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl
  }).done(function() {
    // if the page exists, go there
    location.href = tryUrl;
  }).fail(function() {
    location.href = otherDocsHomepage;
  });
  return false;
}
// Function to populate the version switcher
(function () {
  // The switcher markup is rendered twice on this page (header navbar and
  // primary sidebar) with the same ids, so an id lookup only ever reaches the
  // first copy. Resolve the menu that sits next to this <script> instead.
  // document.currentScript is only set while the script runs synchronously,
  // so it must be read here, before the asynchronous request starts.
  const container = document.currentScript && document.currentScript.parentElement;
  const menu = (container && container.querySelector("#version_switcher"))
    || document.getElementById("version_switcher");
  if (!menu) {
    return; // nothing to populate
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data) {
    // create the nodes in the order given by the JSON; each link points at
    // the homepage of that docs version, and the click handler decides the
    // final destination
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = checkPageExistsAndRedirect;
      menu.appendChild(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// The theme switch button is written by script, so it only appears in the page
// when JavaScript actually runs. It holds one icon span per data-mode value
// (light, dark, auto).
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
// Second copy of the search button, written inside the
// navbar-persistent--mobile container; like the first, it only exists when
// JavaScript actually runs.
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar hide-on-wide">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
function buildURL(entry) {
  // Homepage URL pattern for one docs version; "{version}" is the only placeholder.
  // entry: one element of versions.json; only entry.version is read here.
  // Returns the pattern with the first "{version}" replaced by entry.version.
  const homepagePattern = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return homepagePattern.replace("{version}", entry.version);
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
function checkPageExistsAndRedirect(event) {
  // Click handler for a version-switcher link.
  // event: click event whose target is an <a>; its href is the homepage of
  //        another docs version.
  // Sends a HEAD request for the current page's path under that version and
  // navigates there on success, or to the version's homepage on failure.
  // Returns false so the browser does not follow the link before the probe
  // has finished.
  const currentFilePath = "_modules/pyspark/ml/clustering.html",
        otherDocsHomepage = event.target.getAttribute("href"),
        // The homepage href ends in "index.html"; strip it so the page path is
        // appended to the docs root rather than to ".../index.html".
        otherDocsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  const tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl
  }).done(function() {
    // if the page exists, go there
    location.href = tryUrl;
  }).fail(function() {
    location.href = otherDocsHomepage;
  });
  return false;
}
// Function to populate the version switcher
(function () {
  // The switcher markup is rendered twice on this page (header navbar and
  // primary sidebar) with the same ids, so an id lookup only ever reaches the
  // first copy. Resolve the menu that sits next to this <script> instead.
  // document.currentScript is only set while the script runs synchronously,
  // so it must be read here, before the asynchronous request starts.
  const container = document.currentScript && document.currentScript.parentElement;
  const menu = (container && container.querySelector("#version_switcher"))
    || document.getElementById("version_switcher");
  if (!menu) {
    return; // nothing to populate
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data) {
    // create the nodes in the order given by the JSON; each link points at
    // the homepage of that docs version, and the click handler decides the
    // final destination
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = checkPageExistsAndRedirect;
      menu.appendChild(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// The sidebar's theme switch button is written by script, so it only appears in
// the page when JavaScript actually runs. It holds one icon span per data-mode
// value (light, dark, auto).
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../../../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li>
<li class="breadcrumb-item active" aria-current="page">pyspark.ml.clustering</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<h1>Source code for pyspark.ml.clustering</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">TYPE_CHECKING</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">since</span><span class="p">,</span> <span class="n">keyword_only</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.param.shared</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">HasMaxIter</span><span class="p">,</span>
<span class="n">HasFeaturesCol</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span>
<span class="n">HasPredictionCol</span><span class="p">,</span>
<span class="n">HasAggregationDepth</span><span class="p">,</span>
<span class="n">HasWeightCol</span><span class="p">,</span>
<span class="n">HasTol</span><span class="p">,</span>
<span class="n">HasProbabilityCol</span><span class="p">,</span>
<span class="n">HasDistanceMeasure</span><span class="p">,</span>
<span class="n">HasCheckpointInterval</span><span class="p">,</span>
<span class="n">HasSolver</span><span class="p">,</span>
<span class="n">HasMaxBlockSizeInMB</span><span class="p">,</span>
<span class="n">Param</span><span class="p">,</span>
<span class="n">Params</span><span class="p">,</span>
<span class="n">TypeConverters</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.util</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">GeneralJavaMLWritable</span><span class="p">,</span>
<span class="n">HasTrainingSummary</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.wrapper</span> <span class="kn">import</span> <span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">JavaModel</span><span class="p">,</span> <span class="n">JavaParams</span><span class="p">,</span> <span class="n">JavaWrapper</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.common</span> <span class="kn">import</span> <span class="n">inherit_doc</span><span class="p">,</span> <span class="n">_java2py</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.stat</span> <span class="kn">import</span> <span class="n">MultivariateGaussian</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vector</span><span class="p">,</span> <span class="n">Matrix</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.ml._typing</span> <span class="kn">import</span> <span class="n">M</span>
<span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span>
<span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">,</span>
<span class="s2">&quot;BisectingKMeansModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;BisectingKMeansSummary&quot;</span><span class="p">,</span>
<span class="s2">&quot;KMeans&quot;</span><span class="p">,</span>
<span class="s2">&quot;KMeansModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;KMeansSummary&quot;</span><span class="p">,</span>
<span class="s2">&quot;GaussianMixture&quot;</span><span class="p">,</span>
<span class="s2">&quot;GaussianMixtureModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;GaussianMixtureSummary&quot;</span><span class="p">,</span>
<span class="s2">&quot;LDA&quot;</span><span class="p">,</span>
<span class="s2">&quot;LDAModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;LocalLDAModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;DistributedLDAModel&quot;</span><span class="p">,</span>
<span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">,</span>
<span class="p">]</span>
<span class="k">class</span> <span class="nc">ClusteringSummary</span><span class="p">(</span><span class="n">JavaWrapper</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Clustering results for a given model.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">predictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Name for column of predicted clusters in `predictions`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;predictionCol&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">predictions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> DataFrame produced by the model&#39;s `transform` method.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;predictions&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">featuresCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Name for column of features in `predictions`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;featuresCol&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">k</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The number of clusters the model was trained with.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;k&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">cluster</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> DataFrame of predicted cluster centers for each training data point.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;cluster&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">clusterSizes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Size of (number of data points in) each cluster.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;clusterSizes&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">numIter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Number of iterations.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;numIter&quot;</span><span class="p">)</span>
<span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">_GaussianMixtureParams</span><span class="p">(</span>
<span class="n">HasMaxIter</span><span class="p">,</span>
<span class="n">HasFeaturesCol</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span>
<span class="n">HasPredictionCol</span><span class="p">,</span>
<span class="n">HasProbabilityCol</span><span class="p">,</span>
<span class="n">HasTol</span><span class="p">,</span>
<span class="n">HasAggregationDepth</span><span class="p">,</span>
<span class="n">HasWeightCol</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`GaussianMixture` and :py:class:`GaussianMixtureModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;k&quot;</span><span class="p">,</span>
<span class="s2">&quot;Number of independent Gaussians in the mixture model. &quot;</span> <span class="o">+</span> <span class="s2">&quot;Must be &gt; 1.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_GaussianMixtureParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">tol</span><span class="o">=</span><span class="mf">0.01</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">aggregationDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
<div class="viewcode-block" id="GaussianMixtureModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel">[docs]</a><span class="k">class</span> <span class="nc">GaussianMixtureModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span>
<span class="n">_GaussianMixtureParams</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;GaussianMixtureModel&quot;</span><span class="p">],</span>
<span class="n">HasTrainingSummary</span><span class="p">[</span><span class="s2">&quot;GaussianMixtureSummary&quot;</span><span class="p">],</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by GaussianMixture.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="GaussianMixtureModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixtureModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixtureModel.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixtureModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`predictionCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixtureModel.setProbabilityCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.setProbabilityCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setProbabilityCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixtureModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`probabilityCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">probabilityCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">weights</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Weight for each Gaussian distribution in the mixture.</span>
<span class="sd"> This is a multinomial probability distribution over the k Gaussians,</span>
<span class="sd"> where weights[i] is the weight for Gaussian i, and weights sum to 1.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;weights&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">gaussians</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">MultivariateGaussian</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Array of :py:class:`MultivariateGaussian` where gaussians[i] represents</span>
<span class="sd"> the Multivariate Gaussian (Normal) Distribution for Gaussian i.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">jgaussians</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">gaussians</span><span class="p">()</span>
<span class="k">return</span> <span class="p">[</span>
<span class="n">MultivariateGaussian</span><span class="p">(</span><span class="n">_java2py</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">jgaussian</span><span class="o">.</span><span class="n">mean</span><span class="p">()),</span> <span class="n">_java2py</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">jgaussian</span><span class="o">.</span><span class="n">cov</span><span class="p">()))</span>
<span class="k">for</span> <span class="n">jgaussian</span> <span class="ow">in</span> <span class="n">jgaussians</span>
<span class="p">]</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">gaussiansDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Retrieve Gaussian distributions as a DataFrame.</span>
<span class="sd"> Each row represents a Gaussian Distribution.</span>
<span class="sd"> The DataFrame has two columns: mean (Vector) and cov (Matrix).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;gaussiansDF&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixtureSummary&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets summary (cluster assignments, cluster sizes) of the model trained on the</span>
<span class="sd"> training set. A RuntimeError is raised if no summary exists.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">hasSummary</span><span class="p">:</span>
<span class="k">return</span> <span class="n">GaussianMixtureSummary</span><span class="p">(</span><span class="nb">super</span><span class="p">(</span><span class="n">GaussianMixtureModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
<span class="s2">&quot;No training summary available for this </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span>
<span class="p">)</span>
<div class="viewcode-block" id="GaussianMixtureModel.predict"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.predict">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Predict label for the given features.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;predict&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixtureModel.predictProbability"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.predictProbability">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">predictProbability</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Predict probability for the given features.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;predictProbability&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="GaussianMixture"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">GaussianMixture</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="n">GaussianMixtureModel</span><span class="p">],</span>
<span class="n">_GaussianMixtureParams</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;GaussianMixture&quot;</span><span class="p">],</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> GaussianMixture clustering.</span>
<span class="sd"> This class performs expectation maximization for multivariate Gaussian</span>
<span class="sd"> Mixture Models (GMMs). A GMM represents a composite distribution of</span>
<span class="sd"> independent Gaussian distributions with associated &quot;mixing&quot; weights</span>
<span class="sd"> specifying each&#39;s contribution to the composite.</span>
<span class="sd"> Given a set of sample points, this class will maximize the log-likelihood</span>
<span class="sd"> for a mixture of k Gaussians, iterating until the log-likelihood changes by</span>
<span class="sd"> less than the convergence tolerance (:py:attr:`tol`), or until it has reached the maximum number of iterations (:py:attr:`maxIter`).</span>
<span class="sd"> While this process is generally guaranteed to converge, it is not guaranteed</span>
<span class="sd"> to find a global optimum.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> For high-dimensional data (with many features), this algorithm may perform poorly.</span>
<span class="sd"> This is due to high-dimensional data (a) making it difficult to cluster at all</span>
<span class="sd"> (based on statistical/theoretical arguments) and (b) causing numerical issues with</span>
<span class="sd"> Gaussian distributions.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; data = [(Vectors.dense([-0.1, -0.05 ]),),</span>
<span class="sd"> ... (Vectors.dense([-0.01, -0.1]),),</span>
<span class="sd"> ... (Vectors.dense([0.9, 0.8]),),</span>
<span class="sd"> ... (Vectors.dense([0.75, 0.935]),),</span>
<span class="sd"> ... (Vectors.dense([-0.83, -0.68]),),</span>
<span class="sd"> ... (Vectors.dense([-0.91, -0.76]),)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; gm = GaussianMixture(k=3, tol=0.0001, seed=10)</span>
<span class="sd"> &gt;&gt;&gt; gm.getMaxIter()</span>
<span class="sd"> 100</span>
<span class="sd"> &gt;&gt;&gt; gm.setMaxIter(30)</span>
<span class="sd"> GaussianMixture...</span>
<span class="sd"> &gt;&gt;&gt; gm.getMaxIter()</span>
<span class="sd"> 30</span>
<span class="sd"> &gt;&gt;&gt; model = gm.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getAggregationDepth()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.getFeaturesCol()</span>
<span class="sd"> &#39;features&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setPredictionCol(&quot;newPrediction&quot;)</span>
<span class="sd"> GaussianMixtureModel...</span>
<span class="sd"> &gt;&gt;&gt; model.predict(df.head().features)</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.predictProbability(df.head().features)</span>
<span class="sd"> DenseVector([0.0, 0.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; model.hasSummary</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; summary = model.summary</span>
<span class="sd"> &gt;&gt;&gt; summary.k</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; summary.clusterSizes</span>
<span class="sd"> [2, 2, 2]</span>
<span class="sd"> &gt;&gt;&gt; weights = model.weights</span>
<span class="sd"> &gt;&gt;&gt; len(weights)</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; gaussians = model.gaussians</span>
<span class="sd"> &gt;&gt;&gt; len(gaussians)</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; gaussians[0].mean</span>
<span class="sd"> DenseVector([0.825, 0.8675])</span>
<span class="sd"> &gt;&gt;&gt; gaussians[0].cov</span>
<span class="sd"> DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], 0)</span>
<span class="sd"> &gt;&gt;&gt; gaussians[1].mean</span>
<span class="sd"> DenseVector([-0.87, -0.72])</span>
<span class="sd"> &gt;&gt;&gt; gaussians[1].cov</span>
<span class="sd"> DenseMatrix(2, 2, [0.0016, 0.0016, 0.0016, 0.0016], 0)</span>
<span class="sd"> &gt;&gt;&gt; gaussians[2].mean</span>
<span class="sd"> DenseVector([-0.055, -0.075])</span>
<span class="sd"> &gt;&gt;&gt; gaussians[2].cov</span>
<span class="sd"> DenseMatrix(2, 2, [0.002, -0.0011, -0.0011, 0.0006], 0)</span>
<span class="sd"> &gt;&gt;&gt; model.gaussiansDF.select(&quot;mean&quot;).head()</span>
<span class="sd"> Row(mean=DenseVector([0.825, 0.8675]))</span>
<span class="sd"> &gt;&gt;&gt; model.gaussiansDF.select(&quot;cov&quot;).head()</span>
<span class="sd"> Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))</span>
<span class="sd"> &gt;&gt;&gt; transformed = model.transform(df).select(&quot;features&quot;, &quot;newPrediction&quot;)</span>
<span class="sd"> &gt;&gt;&gt; rows = transformed.collect()</span>
<span class="sd"> &gt;&gt;&gt; rows[4].newPrediction == rows[5].newPrediction</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; rows[2].newPrediction == rows[3].newPrediction</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; gmm_path = temp_path + &quot;/gmm&quot;</span>
<span class="sd"> &gt;&gt;&gt; gm.save(gmm_path)</span>
<span class="sd"> &gt;&gt;&gt; gm2 = GaussianMixture.load(gmm_path)</span>
<span class="sd"> &gt;&gt;&gt; gm2.getK()</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; model_path = temp_path + &quot;/gmm_model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(model_path)</span>
<span class="sd"> &gt;&gt;&gt; model2 = GaussianMixtureModel.load(model_path)</span>
<span class="sd"> &gt;&gt;&gt; model2.hasSummary</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; model2.weights == model.weights</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussians[0].mean == model.gaussians[0].mean</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussians[0].cov == model.gaussians[0].cov</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussians[1].mean == model.gaussians[1].mean</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussians[1].cov == model.gaussians[1].cov</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussians[2].mean == model.gaussians[2].mean</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussians[2].cov == model.gaussians[2].cov</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussiansDF.select(&quot;mean&quot;).head()</span>
<span class="sd"> Row(mean=DenseVector([0.825, 0.8675]))</span>
<span class="sd"> &gt;&gt;&gt; model2.gaussiansDF.select(&quot;cov&quot;).head()</span>
<span class="sd"> Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).take(1) == model2.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; gm2.setWeightCol(&quot;weight&quot;)</span>
<span class="sd"> GaussianMixture...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;prediction&quot;</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">probabilityCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;probability&quot;</span><span class="p">,</span>
<span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.01</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">aggregationDepth</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, predictionCol=&quot;prediction&quot;, k=2, \</span>
<span class="sd"> probabilityCol=&quot;probability&quot;, tol=0.01, maxIter=100, seed=None, \</span>
<span class="sd"> aggregationDepth=2, weightCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">GaussianMixture</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.clustering.GaussianMixture&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixtureModel&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">GaussianMixtureModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<div class="viewcode-block" id="GaussianMixture.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;prediction&quot;</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">probabilityCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;probability&quot;</span><span class="p">,</span>
<span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.01</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">aggregationDepth</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, predictionCol=&quot;prediction&quot;, k=2, \</span>
<span class="sd"> probabilityCol=&quot;probability&quot;, tol=0.01, maxIter=100, seed=None, \</span>
<span class="sd"> aggregationDepth=2, weightCol=None)</span>
<span class="sd"> Sets params for GaussianMixture.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxIter`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`predictionCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setProbabilityCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setProbabilityCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setProbabilityCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`probabilityCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">probabilityCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`weightCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setTol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setTol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setTol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`tol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">tol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="GaussianMixture.setAggregationDepth"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setAggregationDepth">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setAggregationDepth</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GaussianMixture&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`aggregationDepth`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">aggregationDepth</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="GaussianMixtureSummary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureSummary.html#pyspark.ml.clustering.GaussianMixtureSummary">[docs]</a><span class="k">class</span> <span class="nc">GaussianMixtureSummary</span><span class="p">(</span><span class="n">ClusteringSummary</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gaussian mixture clustering results for a given model.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">probabilityCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Name for column of predicted probability of each cluster in `predictions`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;probabilityCol&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">probability</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> DataFrame of probabilities of each cluster for each training data point.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;probability&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">logLikelihood</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Total log-likelihood for this model on the given data.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;logLikelihood&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="KMeansSummary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansSummary.html#pyspark.ml.clustering.KMeansSummary">[docs]</a><span class="k">class</span> <span class="nc">KMeansSummary</span><span class="p">(</span><span class="n">ClusteringSummary</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Summary of KMeans.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">trainingCost</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> K-means cost (sum of squared distances to the nearest centroid for all points in the</span>
<span class="sd"> training dataset). This is equivalent to sklearn&#39;s inertia.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;trainingCost&quot;</span><span class="p">)</span></div>
<span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">_KMeansParams</span><span class="p">(</span>
<span class="n">HasMaxIter</span><span class="p">,</span>
<span class="n">HasFeaturesCol</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span>
<span class="n">HasPredictionCol</span><span class="p">,</span>
<span class="n">HasTol</span><span class="p">,</span>
<span class="n">HasDistanceMeasure</span><span class="p">,</span>
<span class="n">HasWeightCol</span><span class="p">,</span>
<span class="n">HasSolver</span><span class="p">,</span>
<span class="n">HasMaxBlockSizeInMB</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`KMeans` and :py:class:`KMeansModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;k&quot;</span><span class="p">,</span>
<span class="s2">&quot;The number of clusters to create. Must be &gt; 1.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">initMode</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;initMode&quot;</span><span class="p">,</span>
<span class="s1">&#39;The initialization algorithm. This can be either &quot;random&quot; to &#39;</span>
<span class="o">+</span> <span class="s1">&#39;choose random points as initial cluster centers, or &quot;k-means||&quot; &#39;</span>
<span class="o">+</span> <span class="s2">&quot;to use a parallel variant of k-means++&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">initSteps</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;initSteps&quot;</span><span class="p">,</span>
<span class="s2">&quot;The number of steps for k-means|| &quot;</span> <span class="o">+</span> <span class="s2">&quot;initialization mode. Must be &gt; 0.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">solver</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;solver&quot;</span><span class="p">,</span>
<span class="s2">&quot;The solver algorithm for optimization. Supported &quot;</span> <span class="o">+</span> <span class="s2">&quot;options: auto, row, block.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_KMeansParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span>
<span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
<span class="n">initMode</span><span class="o">=</span><span class="s2">&quot;k-means||&quot;</span><span class="p">,</span>
<span class="n">initSteps</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
<span class="n">tol</span><span class="o">=</span><span class="mf">1e-4</span><span class="p">,</span>
<span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span>
<span class="n">distanceMeasure</span><span class="o">=</span><span class="s2">&quot;euclidean&quot;</span><span class="p">,</span>
<span class="n">solver</span><span class="o">=</span><span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">maxBlockSizeInMB</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of `k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of `initMode`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">initMode</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getInitSteps</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of `initSteps`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">initSteps</span><span class="p">)</span>
<div class="viewcode-block" id="KMeansModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel">[docs]</a><span class="k">class</span> <span class="nc">KMeansModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span>
<span class="n">_KMeansParams</span><span class="p">,</span>
<span class="n">GeneralJavaMLWritable</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;KMeansModel&quot;</span><span class="p">],</span>
<span class="n">HasTrainingSummary</span><span class="p">[</span><span class="s2">&quot;KMeansSummary&quot;</span><span class="p">],</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by KMeans.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="KMeansModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeansModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="KMeansModel.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeansModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`predictionCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="KMeansModel.clusterCenters"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.clusterCenters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">clusterCenters</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Get the cluster centers, represented as a list of NumPy arrays.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">[</span><span class="n">c</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;clusterCenters&quot;</span><span class="p">)]</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">KMeansSummary</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets summary (cluster assignments, cluster sizes) of the model trained on the</span>
<span class="sd"> training set. An exception is thrown if no summary exists.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">hasSummary</span><span class="p">:</span>
<span class="k">return</span> <span class="n">KMeansSummary</span><span class="p">(</span><span class="nb">super</span><span class="p">(</span><span class="n">KMeansModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
<span class="s2">&quot;No training summary available for this </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span>
<span class="p">)</span>
<div class="viewcode-block" id="KMeansModel.predict"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.predict">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Predict label for the given features.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;predict&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="KMeans"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">KMeans</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">KMeansModel</span><span class="p">],</span> <span class="n">_KMeansParams</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;KMeans&quot;</span><span class="p">]):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> K-means clustering with a k-means++ like initialization mode</span>
<span class="sd"> (the k-means|| algorithm by Bahmani et al.).</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),</span>
<span class="sd"> ... (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;features&quot;, &quot;weightCol&quot;])</span>
<span class="sd"> &gt;&gt;&gt; kmeans = KMeans(k=2)</span>
<span class="sd"> &gt;&gt;&gt; kmeans.setSeed(1)</span>
<span class="sd"> KMeans...</span>
<span class="sd"> &gt;&gt;&gt; kmeans.setWeightCol(&quot;weightCol&quot;)</span>
<span class="sd"> KMeans...</span>
<span class="sd"> &gt;&gt;&gt; kmeans.setMaxIter(10)</span>
<span class="sd"> KMeans...</span>
<span class="sd"> &gt;&gt;&gt; kmeans.getMaxIter()</span>
<span class="sd"> 10</span>
<span class="sd"> &gt;&gt;&gt; kmeans.clear(kmeans.maxIter)</span>
<span class="sd"> &gt;&gt;&gt; kmeans.getSolver()</span>
<span class="sd"> &#39;auto&#39;</span>
<span class="sd"> &gt;&gt;&gt; model = kmeans.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getMaxBlockSizeInMB()</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; model.getDistanceMeasure()</span>
<span class="sd"> &#39;euclidean&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setPredictionCol(&quot;newPrediction&quot;)</span>
<span class="sd"> KMeansModel...</span>
<span class="sd"> &gt;&gt;&gt; model.predict(df.head().features)</span>
<span class="sd"> 0</span>
<span class="sd"> &gt;&gt;&gt; centers = model.clusterCenters()</span>
<span class="sd"> &gt;&gt;&gt; len(centers)</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; transformed = model.transform(df).select(&quot;features&quot;, &quot;newPrediction&quot;)</span>
<span class="sd"> &gt;&gt;&gt; rows = transformed.collect()</span>
<span class="sd"> &gt;&gt;&gt; rows[0].newPrediction == rows[1].newPrediction</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; rows[2].newPrediction == rows[3].newPrediction</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model.hasSummary</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; summary = model.summary</span>
<span class="sd"> &gt;&gt;&gt; summary.k</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; summary.clusterSizes</span>
<span class="sd"> [2, 2]</span>
<span class="sd"> &gt;&gt;&gt; summary.trainingCost</span>
<span class="sd"> 4.0</span>
<span class="sd"> &gt;&gt;&gt; kmeans_path = temp_path + &quot;/kmeans&quot;</span>
<span class="sd"> &gt;&gt;&gt; kmeans.save(kmeans_path)</span>
<span class="sd"> &gt;&gt;&gt; kmeans2 = KMeans.load(kmeans_path)</span>
<span class="sd"> &gt;&gt;&gt; kmeans2.getK()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model_path = temp_path + &quot;/kmeans_model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(model_path)</span>
<span class="sd"> &gt;&gt;&gt; model2 = KMeansModel.load(model_path)</span>
<span class="sd"> &gt;&gt;&gt; model2.hasSummary</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; model.clusterCenters()[0] == model2.clusterCenters()[0]</span>
<span class="sd"> array([ True, True], dtype=bool)</span>
<span class="sd"> &gt;&gt;&gt; model.clusterCenters()[1] == model2.clusterCenters()[1]</span>
<span class="sd"> array([ True, True], dtype=bool)</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).take(1) == model2.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<!-- Highlighted source of KMeans.__init__ (keyword-only constructor). This run of spans has no viewcode-block wrapper and no [docs] back-link. --><span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;prediction&quot;</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;k-means||&quot;</span><span class="p">,</span>
<span class="n">initSteps</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-4</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;euclidean&quot;</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">solver</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">maxBlockSizeInMB</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, predictionCol=&quot;prediction&quot;, k=2, \</span>
<span class="sd"> initMode=&quot;k-means||&quot;, initSteps=2, tol=1e-4, maxIter=20, seed=None, \</span>
<span class="sd"> distanceMeasure=&quot;euclidean&quot;, weightCol=None, solver=&quot;auto&quot;, \</span>
<span class="sd"> maxBlockSizeInMB=0.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">KMeans</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.clustering.KMeans&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<!-- Highlighted source of the private _create_model helper, which wraps the given java_model in a KMeansModel. It has no viewcode-block wrapper or [docs] back-link. --><span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">KMeansModel</span><span class="p">:</span>
<span class="k">return</span> <span class="n">KMeansModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<!-- Source listing for KMeans.setParams. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setParams" aria-label="docs for pyspark.ml.clustering.KMeans.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;prediction&quot;</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;k-means||&quot;</span><span class="p">,</span>
<span class="n">initSteps</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-4</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;euclidean&quot;</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">solver</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">maxBlockSizeInMB</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, predictionCol=&quot;prediction&quot;, k=2, \</span>
<span class="sd"> initMode=&quot;k-means||&quot;, initSteps=2, tol=1e-4, maxIter=20, seed=None, \</span>
<span class="sd"> distanceMeasure=&quot;euclidean&quot;, weightCol=None, solver=&quot;auto&quot;, \</span>
<span class="sd"> maxBlockSizeInMB=0.0)</span>
<span class="sd"> Sets params for KMeans.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setK. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setK" aria-label="docs for pyspark.ml.clustering.KMeans.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setInitMode. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setInitMode"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setInitMode" aria-label="docs for pyspark.ml.clustering.KMeans.setInitMode">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`initMode`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">initMode</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setInitSteps. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setInitSteps"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setInitSteps" aria-label="docs for pyspark.ml.clustering.KMeans.setInitSteps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInitSteps</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`initSteps`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">initSteps</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setDistanceMeasure. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setDistanceMeasure"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setDistanceMeasure" aria-label="docs for pyspark.ml.clustering.KMeans.setDistanceMeasure">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDistanceMeasure</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`distanceMeasure`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">distanceMeasure</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setMaxIter. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setMaxIter" aria-label="docs for pyspark.ml.clustering.KMeans.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxIter`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setFeaturesCol. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setFeaturesCol" aria-label="docs for pyspark.ml.clustering.KMeans.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setPredictionCol. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setPredictionCol" aria-label="docs for pyspark.ml.clustering.KMeans.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`predictionCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setSeed. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setSeed" aria-label="docs for pyspark.ml.clustering.KMeans.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setTol. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setTol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setTol" aria-label="docs for pyspark.ml.clustering.KMeans.setTol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setTol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`tol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">tol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setWeightCol. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setWeightCol" aria-label="docs for pyspark.ml.clustering.KMeans.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`weightCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setSolver. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. --><div class="viewcode-block" id="KMeans.setSolver"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setSolver" aria-label="docs for pyspark.ml.clustering.KMeans.setSolver">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSolver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`solver`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">solver</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Source listing for KMeans.setMaxBlockSizeInMB. The [docs] link returns to its API reference entry; aria-label gives that link a unique accessible name. The second closing div on the last line ends an enclosing element opened earlier in the file (presumably the KMeans class block; confirm against the full page). --><div class="viewcode-block" id="KMeans.setMaxBlockSizeInMB"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setMaxBlockSizeInMB" aria-label="docs for pyspark.ml.clustering.KMeans.setMaxBlockSizeInMB">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxBlockSizeInMB</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;KMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxBlockSizeInMB`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxBlockSizeInMB</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- Highlighted source of the private _BisectingKMeansParams class: the k and minDivisibleClusterSize Param definitions, their defaults, and the two getters. Unlike the public classes around it, this run of spans has no viewcode-block wrapper and no [docs] back-link. --><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">_BisectingKMeansParams</span><span class="p">(</span>
<span class="n">HasMaxIter</span><span class="p">,</span>
<span class="n">HasFeaturesCol</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span>
<span class="n">HasPredictionCol</span><span class="p">,</span>
<span class="n">HasDistanceMeasure</span><span class="p">,</span>
<span class="n">HasWeightCol</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`BisectingKMeans` and :py:class:`BisectingKMeansModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;k&quot;</span><span class="p">,</span>
<span class="s2">&quot;The desired number of leaf clusters. Must be &gt; 1.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">minDivisibleClusterSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;minDivisibleClusterSize&quot;</span><span class="p">,</span>
<span class="s2">&quot;The minimum number of points (if &gt;= 1.0) or the minimum &quot;</span>
<span class="o">+</span> <span class="s2">&quot;proportion of points (if &lt; 1.0) of a divisible cluster.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_BisectingKMeansParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">minDivisibleClusterSize</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of `k` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinDivisibleClusterSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of `minDivisibleClusterSize` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDivisibleClusterSize</span><span class="p">)</span>
<!-- Source listing for the BisectingKMeansModel class. The outer viewcode-block wraps the whole class and each documented method has its own nested viewcode-block. Every [docs] link returns to the matching API reference entry and carries an aria-label so the links have unique accessible names. The summary property has no nested block or back-link. --><div class="viewcode-block" id="BisectingKMeansModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel" aria-label="docs for pyspark.ml.clustering.BisectingKMeansModel">[docs]</a><span class="k">class</span> <span class="nc">BisectingKMeansModel</span><span class="p">(</span>
<span class="n">JavaModel</span><span class="p">,</span>
<span class="n">_BisectingKMeansParams</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;BisectingKMeansModel&quot;</span><span class="p">],</span>
<span class="n">HasTrainingSummary</span><span class="p">[</span><span class="s2">&quot;BisectingKMeansSummary&quot;</span><span class="p">],</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by BisectingKMeans.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="BisectingKMeansModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.setFeaturesCol" aria-label="docs for pyspark.ml.clustering.BisectingKMeansModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeansModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeansModel.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.setPredictionCol" aria-label="docs for pyspark.ml.clustering.BisectingKMeansModel.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeansModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`predictionCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeansModel.clusterCenters"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.clusterCenters" aria-label="docs for pyspark.ml.clustering.BisectingKMeansModel.clusterCenters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">clusterCenters</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Get the cluster centers, represented as a list of NumPy arrays.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">[</span><span class="n">c</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;clusterCenters&quot;</span><span class="p">)]</span></div>
<div class="viewcode-block" id="BisectingKMeansModel.computeCost"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.computeCost" aria-label="docs for pyspark.ml.clustering.BisectingKMeansModel.computeCost">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">computeCost</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the sum of squared distances between the input points</span>
<span class="sd"> and their corresponding cluster centers.</span>
<span class="sd"> .. deprecated:: 3.0.0</span>
<span class="sd"> It will be removed in future versions. Use :py:class:`ClusteringEvaluator` instead.</span>
<span class="sd"> You can also get the cost on the training dataset in the summary.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;Deprecated in 3.0.0. It will be removed in future versions. Use &quot;</span>
<span class="s2">&quot;ClusteringEvaluator instead. You can also get the cost on the training &quot;</span>
<span class="s2">&quot;dataset in the summary.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;computeCost&quot;</span><span class="p">,</span> <span class="n">dataset</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeansSummary&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets summary (cluster assignments, cluster sizes) of the model trained on the</span>
<span class="sd"> training set. An exception is thrown if no summary exists.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">hasSummary</span><span class="p">:</span>
<span class="k">return</span> <span class="n">BisectingKMeansSummary</span><span class="p">(</span><span class="nb">super</span><span class="p">(</span><span class="n">BisectingKMeansModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
<span class="s2">&quot;No training summary available for this </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span>
<span class="p">)</span>
<div class="viewcode-block" id="BisectingKMeansModel.predict"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.predict" aria-label="docs for pyspark.ml.clustering.BisectingKMeansModel.predict">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Predict label for the given features.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;predict&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="BisectingKMeans"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">BisectingKMeans</span><span class="p">(</span>
<span class="n">JavaEstimator</span><span class="p">[</span><span class="n">BisectingKMeansModel</span><span class="p">],</span>
<span class="n">_BisectingKMeansParams</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">],</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A bisecting k-means algorithm based on the paper &quot;A comparison of document clustering</span>
<span class="sd"> techniques&quot; by Steinbach, Karypis, and Kumar, with modification to fit Spark.</span>
<span class="sd"> The algorithm starts from a single cluster that contains all points.</span>
<span class="sd"> Iteratively it finds divisible clusters on the bottom level and bisects each of them using</span>
<span class="sd"> k-means, until there are `k` leaf clusters in total or no leaf clusters are divisible.</span>
<span class="sd"> The bisecting steps of clusters on the same level are grouped together to increase parallelism.</span>
<span class="sd"> If bisecting all divisible clusters on the bottom level would result in more than `k` leaf</span>
<span class="sd"> clusters, larger clusters get higher priority.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),</span>
<span class="sd"> ... (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;features&quot;, &quot;weighCol&quot;])</span>
<span class="sd"> &gt;&gt;&gt; bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)</span>
<span class="sd"> &gt;&gt;&gt; bkm.setMaxIter(10)</span>
<span class="sd"> BisectingKMeans...</span>
<span class="sd"> &gt;&gt;&gt; bkm.getMaxIter()</span>
<span class="sd"> 10</span>
<span class="sd"> &gt;&gt;&gt; bkm.clear(bkm.maxIter)</span>
<span class="sd"> &gt;&gt;&gt; bkm.setSeed(1)</span>
<span class="sd"> BisectingKMeans...</span>
<span class="sd"> &gt;&gt;&gt; bkm.setWeightCol(&quot;weighCol&quot;)</span>
<span class="sd"> BisectingKMeans...</span>
<span class="sd"> &gt;&gt;&gt; bkm.getSeed()</span>
<span class="sd"> 1</span>
<span class="sd"> &gt;&gt;&gt; bkm.clear(bkm.seed)</span>
<span class="sd"> &gt;&gt;&gt; model = bkm.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getMaxIter()</span>
<span class="sd"> 20</span>
<span class="sd"> &gt;&gt;&gt; model.setPredictionCol(&quot;newPrediction&quot;)</span>
<span class="sd"> BisectingKMeansModel...</span>
<span class="sd"> &gt;&gt;&gt; model.predict(df.head().features)</span>
<span class="sd"> 0</span>
<span class="sd"> &gt;&gt;&gt; centers = model.clusterCenters()</span>
<span class="sd"> &gt;&gt;&gt; len(centers)</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.computeCost(df)</span>
<span class="sd"> 2.0</span>
<span class="sd"> &gt;&gt;&gt; model.hasSummary</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; summary = model.summary</span>
<span class="sd"> &gt;&gt;&gt; summary.k</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; summary.clusterSizes</span>
<span class="sd"> [2, 2]</span>
<span class="sd"> &gt;&gt;&gt; summary.trainingCost</span>
<span class="sd"> 4.000...</span>
<span class="sd"> &gt;&gt;&gt; transformed = model.transform(df).select(&quot;features&quot;, &quot;newPrediction&quot;)</span>
<span class="sd"> &gt;&gt;&gt; rows = transformed.collect()</span>
<span class="sd"> &gt;&gt;&gt; rows[0].newPrediction == rows[1].newPrediction</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; rows[2].newPrediction == rows[3].newPrediction</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; bkm_path = temp_path + &quot;/bkm&quot;</span>
<span class="sd"> &gt;&gt;&gt; bkm.save(bkm_path)</span>
<span class="sd"> &gt;&gt;&gt; bkm2 = BisectingKMeans.load(bkm_path)</span>
<span class="sd"> &gt;&gt;&gt; bkm2.getK()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; bkm2.getDistanceMeasure()</span>
<span class="sd"> &#39;euclidean&#39;</span>
<span class="sd"> &gt;&gt;&gt; model_path = temp_path + &quot;/bkm_model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(model_path)</span>
<span class="sd"> &gt;&gt;&gt; model2 = BisectingKMeansModel.load(model_path)</span>
<span class="sd"> &gt;&gt;&gt; model2.hasSummary</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; model.clusterCenters()[0] == model2.clusterCenters()[0]</span>
<span class="sd"> array([ True, True], dtype=bool)</span>
<span class="sd"> &gt;&gt;&gt; model.clusterCenters()[1] == model2.clusterCenters()[1]</span>
<span class="sd"> array([ True, True], dtype=bool)</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).take(1) == model2.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;prediction&quot;</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span><span class="p">,</span>
<span class="n">minDivisibleClusterSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;euclidean&quot;</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, predictionCol=&quot;prediction&quot;, maxIter=20, \</span>
<span class="sd"> seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure=&quot;euclidean&quot;, \</span>
<span class="sd"> weightCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">BisectingKMeans</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.clustering.BisectingKMeans&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="BisectingKMeans.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;prediction&quot;</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span><span class="p">,</span>
<span class="n">minDivisibleClusterSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span>
<span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;euclidean&quot;</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, predictionCol=&quot;prediction&quot;, maxIter=20, \</span>
<span class="sd"> seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure=&quot;euclidean&quot;, \</span>
<span class="sd"> weightCol=None)</span>
<span class="sd"> Sets params for BisectingKMeans.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setMinDivisibleClusterSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setMinDivisibleClusterSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinDivisibleClusterSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minDivisibleClusterSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDivisibleClusterSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setDistanceMeasure"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setDistanceMeasure">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDistanceMeasure</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`distanceMeasure`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">distanceMeasure</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxIter`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`predictionCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeans.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BisectingKMeans&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`weightCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">BisectingKMeansModel</span><span class="p">:</span>
<span class="k">return</span> <span class="n">BisectingKMeansModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="BisectingKMeansSummary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansSummary.html#pyspark.ml.clustering.BisectingKMeansSummary">[docs]</a><span class="k">class</span> <span class="nc">BisectingKMeansSummary</span><span class="p">(</span><span class="n">ClusteringSummary</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Bisecting KMeans clustering results for a given model.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">trainingCost</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sum of squared distances to the nearest centroid for all points in the training dataset.</span>
<span class="sd"> This is equivalent to sklearn&#39;s inertia.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;trainingCost&quot;</span><span class="p">)</span></div>
<span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">_LDAParams</span><span class="p">(</span><span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasSeed</span><span class="p">,</span> <span class="n">HasCheckpointInterval</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`LDA` and :py:class:`LDAModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;k&quot;</span><span class="p">,</span>
<span class="s2">&quot;The number of topics (clusters) to infer. Must be &gt; 1.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">optimizer</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;optimizer&quot;</span><span class="p">,</span>
<span class="s2">&quot;Optimizer or inference algorithm used to estimate the LDA model. &quot;</span>
<span class="s2">&quot;Supported: online, em&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">learningOffset</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;learningOffset&quot;</span><span class="p">,</span>
<span class="s2">&quot;A (positive) learning parameter that downweights early iterations.&quot;</span>
<span class="s2">&quot; Larger values make early iterations count less&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">learningDecay</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;learningDecay&quot;</span><span class="p">,</span>
<span class="s2">&quot;Learning rate, set as an&quot;</span>
<span class="s2">&quot;exponential decay rate. This should be between (0.5, 1.0] to &quot;</span>
<span class="s2">&quot;guarantee asymptotic convergence.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">subsamplingRate</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;subsamplingRate&quot;</span><span class="p">,</span>
<span class="s2">&quot;Fraction of the corpus to be sampled and used in each iteration &quot;</span>
<span class="s2">&quot;of mini-batch gradient descent, in range (0, 1].&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">optimizeDocConcentration</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;optimizeDocConcentration&quot;</span><span class="p">,</span>
<span class="s2">&quot;Indicates whether the docConcentration (Dirichlet parameter &quot;</span>
<span class="s2">&quot;for document-topic distribution) will be optimized during &quot;</span>
<span class="s2">&quot;training.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">docConcentration</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;docConcentration&quot;</span><span class="p">,</span>
<span class="s1">&#39;Concentration parameter (commonly named &quot;alpha&quot;) for the &#39;</span>
<span class="s1">&#39;prior placed on documents</span><span class="se">\&#39;</span><span class="s1"> distributions over topics (&quot;theta&quot;).&#39;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">topicConcentration</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;topicConcentration&quot;</span><span class="p">,</span>
<span class="s1">&#39;Concentration parameter (commonly named &quot;beta&quot; or &quot;eta&quot;) for &#39;</span>
<span class="s2">&quot;the prior placed on topics&#39; distributions over terms.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">topicDistributionCol</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;topicDistributionCol&quot;</span><span class="p">,</span>
<span class="s2">&quot;Output column with estimates of the topic mixture distribution &quot;</span>
<span class="s1">&#39;for each document (often called &quot;theta&quot; in the literature). &#39;</span>
<span class="s2">&quot;Returns a vector of zeros for an empty document.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">keepLastCheckpoint</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;keepLastCheckpoint&quot;</span><span class="p">,</span>
<span class="s2">&quot;(For EM optimizer) If using checkpointing, this indicates whether&quot;</span>
<span class="s2">&quot; to keep the last checkpoint. If false, then the checkpoint will be&quot;</span>
<span class="s2">&quot; deleted. Deleting the checkpoint can cause failures if a data&quot;</span>
<span class="s2">&quot; partition is lost, so set this bit with care.&quot;</span><span class="p">,</span>
<span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_LDAParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span>
<span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span>
<span class="n">checkpointInterval</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="n">k</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="n">optimizer</span><span class="o">=</span><span class="s2">&quot;online&quot;</span><span class="p">,</span>
<span class="n">learningOffset</span><span class="o">=</span><span class="mf">1024.0</span><span class="p">,</span>
<span class="n">learningDecay</span><span class="o">=</span><span class="mf">0.51</span><span class="p">,</span>
<span class="n">subsamplingRate</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span>
<span class="n">optimizeDocConcentration</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">topicDistributionCol</span><span class="o">=</span><span class="s2">&quot;topicDistribution&quot;</span><span class="p">,</span>
<span class="n">keepLastCheckpoint</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`k` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getOptimizer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`optimizer` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLearningOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`learningOffset` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">learningOffset</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLearningDecay</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`learningDecay` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">learningDecay</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSubsamplingRate</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`subsamplingRate` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">subsamplingRate</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getOptimizeDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`optimizeDocConcentration` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">optimizeDocConcentration</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`docConcentration` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">docConcentration</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getTopicConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`topicConcentration` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">topicConcentration</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getTopicDistributionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`topicDistributionCol` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">topicDistributionCol</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getKeepLastCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`keepLastCheckpoint` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">keepLastCheckpoint</span><span class="p">)</span>
<div class="viewcode-block" id="LDAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">LDAModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_LDAParams</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Latent Dirichlet Allocation (LDA) model.</span>
<span class="sd"> This abstraction permits different underlying representations,</span>
<span class="sd"> including local and distributed data structures.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="LDAModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">&quot;M&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;M&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">&quot;M&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;M&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.setTopicDistributionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.setTopicDistributionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setTopicDistributionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">&quot;M&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;M&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`topicDistributionCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">topicDistributionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.isDistributed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.isDistributed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">isDistributed</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Indicates whether this instance is of type DistributedLDAModel</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;isDistributed&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.vocabSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.vocabSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">vocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Vocabulary size (number of terms or words in the vocabulary)&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;vocabSize&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.topicsMatrix"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.topicsMatrix">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">topicsMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Matrix</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Inferred topics, where each topic is represented by a distribution over terms.</span>
<span class="sd"> This is a matrix of size vocabSize x k, where each column is a topic.</span>
<span class="sd"> No guarantees are given about the ordering of the topics.</span>
<span class="sd"> .. warning:: If this model is actually a :py:class:`DistributedLDAModel`</span>
<span class="sd"> instance produced by the Expectation-Maximization (&quot;em&quot;) `optimizer`,</span>
<span class="sd"> then this method could involve collecting a large amount of data</span>
<span class="sd"> to the driver (on the order of vocabSize x k).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;topicsMatrix&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.logLikelihood"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.logLikelihood">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">logLikelihood</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculates a lower bound on the log likelihood of the entire corpus.</span>
<span class="sd"> See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</span>
<span class="sd"> .. warning:: If this model is an instance of :py:class:`DistributedLDAModel` (produced when</span>
<span class="sd"> :py:attr:`optimizer` is set to &quot;em&quot;), this involves collecting a large</span>
<span class="sd"> :py:func:`topicsMatrix` to the driver. This implementation may be changed in the future.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;logLikelihood&quot;</span><span class="p">,</span> <span class="n">dataset</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.logPerplexity"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.logPerplexity">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">logPerplexity</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculate an upper bound on perplexity. (Lower is better.)</span>
<span class="sd"> See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</span>
<span class="sd"> .. warning:: If this model is an instance of :py:class:`DistributedLDAModel` (produced when</span>
<span class="sd"> :py:attr:`optimizer` is set to &quot;em&quot;), this involves collecting a large</span>
<span class="sd"> :py:func:`topicsMatrix` to the driver. This implementation may be changed in the future.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;logPerplexity&quot;</span><span class="p">,</span> <span class="n">dataset</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.describeTopics"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.describeTopics">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">describeTopics</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">maxTermsPerTopic</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the topics described by their top-weighted terms.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;describeTopics&quot;</span><span class="p">,</span> <span class="n">maxTermsPerTopic</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDAModel.estimatedDocConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.estimatedDocConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">estimatedDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Vector</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Value for :py:attr:`LDA.docConcentration` estimated from data.</span>
<span class="sd"> If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false,</span>
<span class="sd"> then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration` parameter.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;estimatedDocConcentration&quot;</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="DistributedLDAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">DistributedLDAModel</span><span class="p">(</span><span class="n">LDAModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;DistributedLDAModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Distributed model fitted by :py:class:`LDA`.</span>
<span class="sd"> This type of model is currently only produced by Expectation-Maximization (EM).</span>
<span class="sd"> This model stores the inferred topics, the full training dataset, and the topic distribution</span>
<span class="sd"> for each training document.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="DistributedLDAModel.toLocal"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.toLocal">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">toLocal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LocalLDAModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert this distributed model to a local representation. This discards info about the</span>
<span class="sd"> training dataset.</span>
<span class="sd"> .. warning:: This involves collecting a large :py:func:`topicsMatrix` to the driver.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">LocalLDAModel</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;toLocal&quot;</span><span class="p">))</span>
<span class="c1"># SPARK-10931: Temporary fix to be removed once LDAModel defines Params</span>
<span class="n">model</span><span class="o">.</span><span class="n">_create_params_from_java</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">_transfer_params_from_java</span><span class="p">()</span>
<span class="k">return</span> <span class="n">model</span></div>
<div class="viewcode-block" id="DistributedLDAModel.trainingLogLikelihood"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.trainingLogLikelihood">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">trainingLogLikelihood</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Log likelihood of the observed tokens in the training set,</span>
<span class="sd"> given the current parameter estimates:</span>
<span class="sd"> log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - This excludes the prior; for that, use :py:func:`logPrior`.</span>
<span class="sd"> - Even with :py:func:`logPrior`, this is NOT the same as the data log likelihood given</span>
<span class="sd"> the hyperparameters.</span>
<span class="sd"> - This is computed from the topic distributions computed during training. If you call</span>
<span class="sd"> :py:func:`logLikelihood` on the same training dataset, the topic distributions</span>
<span class="sd"> will be computed again, possibly giving different results.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;trainingLogLikelihood&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="DistributedLDAModel.logPrior"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.logPrior">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">logPrior</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Log probability of the current parameter estimate:</span>
<span class="sd"> log P(topics, topic distributions for docs | alpha, eta)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;logPrior&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="DistributedLDAModel.getCheckpointFiles"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.getCheckpointFiles">[docs]</a> <span class="k">def</span> <span class="nf">getCheckpointFiles</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may</span>
<span class="sd"> be saved checkpoint files. This method is provided so that users can manage those files.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> List of checkpoint files from training</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Removing the checkpoints can cause failures if a partition is lost and is needed</span>
<span class="sd"> by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up</span>
<span class="sd"> the checkpoints when this model and derivative data go out of scope.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;getCheckpointFiles&quot;</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="LocalLDAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LocalLDAModel.html#pyspark.ml.clustering.LocalLDAModel">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">LocalLDAModel</span><span class="p">(</span><span class="n">LDAModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;LocalLDAModel&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Local (non-distributed) model fitted by :py:class:`LDA`.</span>
<span class="sd"> This model stores the inferred topics only; it does not store info about the training dataset.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">pass</span></div>
<div class="viewcode-block" id="LDA"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">LDA</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">LDAModel</span><span class="p">],</span> <span class="n">_LDAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;LDA&quot;</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Latent Dirichlet Allocation (LDA), a topic model designed for text documents.</span>
<span class="sd"> Terminology:</span>
<span class="sd"> - &quot;term&quot; = &quot;word&quot;: an element of the vocabulary</span>
<span class="sd"> - &quot;token&quot;: instance of a term appearing in a document</span>
<span class="sd"> - &quot;topic&quot;: multinomial distribution over terms representing some concept</span>
<span class="sd"> - &quot;document&quot;: one piece of text, corresponding to one row in the input data</span>
<span class="sd"> Original LDA paper (journal version):</span>
<span class="sd"> Blei, Ng, and Jordan. &quot;Latent Dirichlet Allocation.&quot; JMLR, 2003.</span>
<span class="sd"> Input data (featuresCol):</span>
<span class="sd"> LDA is given a collection of documents as input data, via the featuresCol parameter.</span>
<span class="sd"> Each document is specified as a :py:class:`Vector` of length vocabSize, where each entry is the</span>
<span class="sd"> count for the corresponding term (word) in the document. Feature transformers such as</span>
<span class="sd"> :py:class:`pyspark.ml.feature.Tokenizer` and :py:class:`pyspark.ml.feature.CountVectorizer`</span>
<span class="sd"> can be useful for converting text to word count vectors.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors, SparseVector</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.clustering import LDA</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1, Vectors.dense([0.0, 1.0])],</span>
<span class="sd"> ... [2, SparseVector(2, {0: 1.0})],], [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; lda = LDA(k=2, seed=1, optimizer=&quot;em&quot;)</span>
<span class="sd"> &gt;&gt;&gt; lda.setMaxIter(10)</span>
<span class="sd"> LDA...</span>
<span class="sd"> &gt;&gt;&gt; lda.getMaxIter()</span>
<span class="sd"> 10</span>
<span class="sd"> &gt;&gt;&gt; lda.clear(lda.maxIter)</span>
<span class="sd"> &gt;&gt;&gt; model = lda.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setSeed(1)</span>
<span class="sd"> DistributedLDAModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getTopicDistributionCol()</span>
<span class="sd"> &#39;topicDistribution&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.isDistributed()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; localModel = model.toLocal()</span>
<span class="sd"> &gt;&gt;&gt; localModel.isDistributed()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; model.vocabSize()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.describeTopics().show()</span>
<span class="sd"> +-----+-----------+--------------------+</span>
<span class="sd"> |topic|termIndices| termWeights|</span>
<span class="sd"> +-----+-----------+--------------------+</span>
<span class="sd"> | 0| [1, 0]|[0.50401530077160...|</span>
<span class="sd"> | 1| [0, 1]|[0.50401530077160...|</span>
<span class="sd"> +-----+-----------+--------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.topicsMatrix()</span>
<span class="sd"> DenseMatrix(2, 2, [0.496, 0.504, 0.504, 0.496], 0)</span>
<span class="sd"> &gt;&gt;&gt; lda_path = temp_path + &quot;/lda&quot;</span>
<span class="sd"> &gt;&gt;&gt; lda.save(lda_path)</span>
<span class="sd"> &gt;&gt;&gt; sameLDA = LDA.load(lda_path)</span>
<span class="sd"> &gt;&gt;&gt; distributed_model_path = temp_path + &quot;/lda_distributed_model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(distributed_model_path)</span>
<span class="sd"> &gt;&gt;&gt; sameModel = DistributedLDAModel.load(distributed_model_path)</span>
<span class="sd"> &gt;&gt;&gt; local_model_path = temp_path + &quot;/lda_local_model&quot;</span>
<span class="sd"> &gt;&gt;&gt; localModel.save(local_model_path)</span>
<span class="sd"> &gt;&gt;&gt; sameLocalModel = LocalLDAModel.load(local_model_path)</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).take(1) == sameLocalModel.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">checkpointInterval</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span>
<span class="n">optimizer</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;online&quot;</span><span class="p">,</span>
<span class="n">learningOffset</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1024.0</span><span class="p">,</span>
<span class="n">learningDecay</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.51</span><span class="p">,</span>
<span class="n">subsamplingRate</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="n">optimizeDocConcentration</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">docConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">topicConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">topicDistributionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;topicDistribution&quot;</span><span class="p">,</span>
<span class="n">keepLastCheckpoint</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, maxIter=20, seed=None, checkpointInterval=10,\</span>
<span class="sd"> k=10, optimizer=&quot;online&quot;, learningOffset=1024.0, learningDecay=0.51,\</span>
<span class="sd"> subsamplingRate=0.05, optimizeDocConcentration=True,\</span>
<span class="sd"> docConcentration=None, topicConcentration=None,\</span>
<span class="sd"> topicDistributionCol=&quot;topicDistribution&quot;, keepLastCheckpoint=True)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">LDA</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.clustering.LDA&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">LDAModel</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOptimizer</span><span class="p">()</span> <span class="o">==</span> <span class="s2">&quot;em&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">DistributedLDAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">LocalLDAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<div class="viewcode-block" id="LDA.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">checkpointInterval</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span>
<span class="n">optimizer</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;online&quot;</span><span class="p">,</span>
<span class="n">learningOffset</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1024.0</span><span class="p">,</span>
<span class="n">learningDecay</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.51</span><span class="p">,</span>
<span class="n">subsamplingRate</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span>
<span class="n">optimizeDocConcentration</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">docConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">topicConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">topicDistributionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;topicDistribution&quot;</span><span class="p">,</span>
<span class="n">keepLastCheckpoint</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, maxIter=20, seed=None, checkpointInterval=10,\</span>
<span class="sd"> k=10, optimizer=&quot;online&quot;, learningOffset=1024.0, learningDecay=0.51,\</span>
<span class="sd"> subsamplingRate=0.05, optimizeDocConcentration=True,\</span>
<span class="sd"> docConcentration=None, topicConcentration=None,\</span>
<span class="sd"> topicDistributionCol=&quot;topicDistribution&quot;, keepLastCheckpoint=True)</span>
<span class="sd"> Sets params for LDA.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setCheckpointInterval"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setCheckpointInterval">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setCheckpointInterval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`checkpointInterval`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">checkpointInterval</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setK(10)</span>
<span class="sd"> &gt;&gt;&gt; algo.getK()</span>
<span class="sd"> 10</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setOptimizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setOptimizer">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOptimizer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`optimizer`.</span>
<span class="sd"> Currently only &#39;em&#39; and &#39;online&#39; are supported.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setOptimizer(&quot;em&quot;)</span>
<span class="sd"> &gt;&gt;&gt; algo.getOptimizer()</span>
<span class="sd"> &#39;em&#39;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">optimizer</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setLearningOffset"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setLearningOffset">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLearningOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`learningOffset`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setLearningOffset(100)</span>
<span class="sd"> &gt;&gt;&gt; algo.getLearningOffset()</span>
<span class="sd"> 100.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">learningOffset</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setLearningDecay"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setLearningDecay">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLearningDecay</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`learningDecay`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setLearningDecay(0.1)</span>
<span class="sd"> &gt;&gt;&gt; algo.getLearningDecay()</span>
<span class="sd"> 0.1...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">learningDecay</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setSubsamplingRate"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setSubsamplingRate">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSubsamplingRate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`subsamplingRate`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setSubsamplingRate(0.1)</span>
<span class="sd"> &gt;&gt;&gt; algo.getSubsamplingRate()</span>
<span class="sd"> 0.1...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">subsamplingRate</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setOptimizeDocConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setOptimizeDocConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOptimizeDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`optimizeDocConcentration`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setOptimizeDocConcentration(True)</span>
<span class="sd"> &gt;&gt;&gt; algo.getOptimizeDocConcentration()</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">optimizeDocConcentration</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setDocConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setDocConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`docConcentration`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setDocConcentration([0.1, 0.2])</span>
<span class="sd"> &gt;&gt;&gt; algo.getDocConcentration()</span>
<span class="sd"> [0.1..., 0.2...]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">docConcentration</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setTopicConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setTopicConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setTopicConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`topicConcentration`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setTopicConcentration(0.5)</span>
<span class="sd"> &gt;&gt;&gt; algo.getTopicConcentration()</span>
<span class="sd"> 0.5...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">topicConcentration</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setTopicDistributionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setTopicDistributionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setTopicDistributionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`topicDistributionCol`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setTopicDistributionCol(&quot;topicDistributionCol&quot;)</span>
<span class="sd"> &gt;&gt;&gt; algo.getTopicDistributionCol()</span>
<span class="sd"> &#39;topicDistributionCol&#39;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">topicDistributionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setKeepLastCheckpoint"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setKeepLastCheckpoint">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setKeepLastCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`keepLastCheckpoint`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; algo = LDA().setKeepLastCheckpoint(False)</span>
<span class="sd"> &gt;&gt;&gt; algo.getKeepLastCheckpoint()</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">keepLastCheckpoint</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxIter`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="LDA.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LDA&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">_PowerIterationClusteringParams</span><span class="p">(</span><span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasWeightCol</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`PowerIterationClustering`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;k&quot;</span><span class="p">,</span>
<span class="s2">&quot;The number of clusters to create. Must be &gt; 1.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">initMode</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;initMode&quot;</span><span class="p">,</span>
<span class="s2">&quot;The initialization algorithm. This can be either &quot;</span>
<span class="o">+</span> <span class="s2">&quot;&#39;random&#39; to use a random vector as vertex properties, or &#39;degree&#39; to use &quot;</span>
<span class="o">+</span> <span class="s2">&quot;a normalized sum of similarities with other vertices. Supported options: &quot;</span>
<span class="o">+</span> <span class="s2">&quot;&#39;random&#39; and &#39;degree&#39;.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">srcCol</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;srcCol&quot;</span><span class="p">,</span>
<span class="s2">&quot;Name of the input column for source vertex IDs.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">dstCol</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span>
<span class="s2">&quot;dstCol&quot;</span><span class="p">,</span>
<span class="s2">&quot;Name of the input column for destination vertex IDs.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_PowerIterationClusteringParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">initMode</span><span class="o">=</span><span class="s2">&quot;random&quot;</span><span class="p">,</span> <span class="n">srcCol</span><span class="o">=</span><span class="s2">&quot;src&quot;</span><span class="p">,</span> <span class="n">dstCol</span><span class="o">=</span><span class="s2">&quot;dst&quot;</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`k` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`initMode` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">initMode</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSrcCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`srcCol` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">srcCol</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getDstCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`dstCol` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dstCol</span><span class="p">)</span>
<div class="viewcode-block" id="PowerIterationClustering"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PowerIterationClustering</span><span class="p">(</span>
<span class="n">_PowerIterationClusteringParams</span><span class="p">,</span>
<span class="n">JavaParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">],</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by</span>
<span class="sd"> `Lin and Cohen &lt;http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf&gt;`_. From the</span>
<span class="sd"> abstract: PIC finds a very low-dimensional embedding of a dataset using truncated power</span>
<span class="sd"> iteration on a normalized pair-wise similarity matrix of the data.</span>
<span class="sd"> This class is not yet an Estimator/Transformer; use the :py:func:`assignClusters` method</span>
<span class="sd"> to run the PowerIterationClustering algorithm.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> See `Wikipedia on Spectral clustering &lt;http://en.wikipedia.org/wiki/Spectral_clustering&gt;`_</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, 0, 0.5),</span>
<span class="sd"> ... (2, 0, 0.5), (2, 1, 0.7),</span>
<span class="sd"> ... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),</span>
<span class="sd"> ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),</span>
<span class="sd"> ... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data).toDF(&quot;src&quot;, &quot;dst&quot;, &quot;weight&quot;).repartition(1)</span>
<span class="sd"> &gt;&gt;&gt; pic = PowerIterationClustering(k=2, weightCol=&quot;weight&quot;)</span>
<span class="sd"> &gt;&gt;&gt; pic.setMaxIter(40)</span>
<span class="sd"> PowerIterationClustering...</span>
<span class="sd"> &gt;&gt;&gt; assignments = pic.assignClusters(df)</span>
<span class="sd"> &gt;&gt;&gt; assignments.sort(assignments.id).show(truncate=False)</span>
<span class="sd"> +---+-------+</span>
<span class="sd"> |id |cluster|</span>
<span class="sd"> +---+-------+</span>
<span class="sd"> |0 |0 |</span>
<span class="sd"> |1 |0 |</span>
<span class="sd"> |2 |0 |</span>
<span class="sd"> |3 |0 |</span>
<span class="sd"> |4 |0 |</span>
<span class="sd"> |5 |1 |</span>
<span class="sd"> +---+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; pic_path = temp_path + &quot;/pic&quot;</span>
<span class="sd"> &gt;&gt;&gt; pic.save(pic_path)</span>
<span class="sd"> &gt;&gt;&gt; pic2 = PowerIterationClustering.load(pic_path)</span>
<span class="sd"> &gt;&gt;&gt; pic2.getK()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; pic2.getMaxIter()</span>
<span class="sd"> 40</span>
<span class="sd"> &gt;&gt;&gt; pic2.assignClusters(df).take(6) == assignments.take(6)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;random&quot;</span><span class="p">,</span>
<span class="n">srcCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;src&quot;</span><span class="p">,</span>
<span class="n">dstCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;dst&quot;</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, k=2, maxIter=20, initMode=&quot;random&quot;, srcCol=&quot;src&quot;, dstCol=&quot;dst&quot;,\</span>
<span class="sd"> weightCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PowerIterationClustering</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.clustering.PowerIterationClustering&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span>
<span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="PowerIterationClustering.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span>
<span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;random&quot;</span><span class="p">,</span>
<span class="n">srcCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;src&quot;</span><span class="p">,</span>
<span class="n">dstCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;dst&quot;</span><span class="p">,</span>
<span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, k=2, maxIter=20, initMode=&quot;random&quot;, srcCol=&quot;src&quot;, dstCol=&quot;dst&quot;,\</span>
<span class="sd"> weightCol=None)</span>
<span class="sd"> Sets params for PowerIterationClustering.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="PowerIterationClustering.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PowerIterationClustering.setInitMode"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setInitMode">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`initMode`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">initMode</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PowerIterationClustering.setSrcCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setSrcCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSrcCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`srcCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">srcCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PowerIterationClustering.setDstCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setDstCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDstCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`dstCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dstCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PowerIterationClustering.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxIter`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PowerIterationClustering.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PowerIterationClustering&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`weightCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PowerIterationClustering.assignClusters"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.assignClusters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">assignClusters</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Runs the PIC algorithm and returns a cluster assignment for each input vertex.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dataset : :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> A dataset with columns src, dst, weight representing the affinity matrix,</span>
<span class="sd"> which is the matrix A in the PIC paper. Suppose the src column value is i,</span>
<span class="sd"> the dst column value is j, the weight column value is similarity s,,ij,,</span>
<span class="sd"> which must be nonnegative. This is a symmetric matrix and hence</span>
<span class="sd"> s,,ij,, = s,,ji,,. For any (i, j) with nonzero similarity, there should be</span>
<span class="sd"> either (i, j, s,,ij,,) or (j, i, s,,ji,,) in the input. Rows with i = j are</span>
<span class="sd"> ignored, because we assume s,,ij,, = 0.0.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> A dataset that contains columns of vertex id and the corresponding cluster for</span>
<span class="sd"> the id. The schema of it will be:</span>
<span class="sd"> - id: Long</span>
<span class="sd"> - cluster: Int</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_transfer_params_to_java</span><span class="p">()</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">assignClusters</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">_jdf</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="n">dataset</span><span class="o">.</span><span class="n">sparkSession</span><span class="p">)</span></div></div>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">numpy</span>
<span class="kn">import</span> <span class="nn">pyspark.ml.clustering</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># NumPy 1.14+ changed its string format.</span>
<span class="n">numpy</span><span class="o">.</span><span class="n">set_printoptions</span><span class="p">(</span><span class="n">legacy</span><span class="o">=</span><span class="s2">&quot;1.13&quot;</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">clustering</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="c1"># The small batch size here ensures that we see multiple batches,</span>
<span class="c1"># even in these small test examples:</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[2]&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;ml.clustering tests&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;sc&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;spark&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="n">temp_path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;temp_path&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">temp_path</span>
<span class="k">try</span><span class="p">:</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">finally</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">shutil</span> <span class="kn">import</span> <span class="n">rmtree</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">rmtree</span><span class="p">(</span><span class="n">temp_path</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<!-- Site-wide page footer: copyright/licence notice on the left, build-tool versions alongside it and on the right. -->
<footer class="bd-footer">
  <div class="bd-footer__inner bd-page-width">
    <div class="footer-items__start">
      <div class="footer-item">
        <!-- Uses the copyright sign rather than a literal "@". -->
        <p class="copyright">
          Copyright &copy; 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
        </p>
      </div>
      <div class="footer-item">
        <p class="sphinx-version">
          Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
          <br/>
        </p>
      </div>
    </div>
    <div class="footer-items__end">
      <div class="footer-item">
        <p class="theme-version">
          Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
        </p>
      </div>
    </div>
  </div>
</footer>
</body>
</html>