| |
| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en"> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <!-- Keep exactly one viewport declaration per document. --> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>pyspark.ml.clustering — PySpark 4.0.0-preview1 documentation</title> |
| |
| |
| |
| <!-- Apply the stored mode/theme before any stylesheet is loaded. --> |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"; |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/clipboard.min.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/ml/clustering';</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/clustering.html" /> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="docsearch:language" content="en"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <!-- State checkbox "__primary": the header's "primary-toggle" label and the overlay label below both point at it through for="__primary", so clicking either one flips it. NOTE(review): presumably the stylesheet shows/hides the primary sidebar from this checked state; confirm in the theme CSS. --> |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <!-- State checkbox "__secondary" with its own overlay label, mirroring the "__primary" pair above. --> |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <!-- Search overlay: a GET form that sends the query as "q" to search.html. --> |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../../../search.html" |
| method="get"> |
| <!-- Purely decorative icon: hidden from assistive technology, the input below carries the accessible name. --> |
| <i class="fa-solid fa-magnifying-glass" aria-hidden="true"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| <!-- Brand logo linking to the documentation home page; the alt text names the link destination because the image is the link's only content. --> |
| <a class="navbar-brand logo" href="../../../index.html"> |
| <img src="../../../_static/spark-logo-light.png" class="logo__image only-light" alt="PySpark documentation home"/> |
| <!-- The dark-theme logo is written by script, so only the light logo is present when scripts do not run. --> |
| <script>document.write(`<img src="../../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="PySpark documentation home"/>`);</script> |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <!-- Top-level site navigation. The landmark is named on the nav element itself; the visible title is exposed as a level-2 heading so it does not compete with the page's h1. --> |
| <div class="navbar-item"><nav class="navbar-nav" aria-label="Site Navigation"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="2"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <!-- The search button is emitted with document.write, so the button markup only exists when scripts run. --> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <!-- Version switcher: a button showing the current docs version plus a menu that the script below fills in. --> |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Build the docs homepage URL for one versions.json entry by substituting |
| // entry.version into the URL template. |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| return template.replace("{version}", entry.version); |
| } |
| |
| // Click handler for a version link: send a HEAD request to find out whether the |
| // current page also exists in the selected docs version and go there if it does; |
| // otherwise go to that version's homepage. Returns false so the browser does not |
| // follow the link itself while the request is in flight. |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/ml/clustering.html", |
| otherDocsHomepage = event.currentTarget.getAttribute("href"); |
| // The homepage URL ends in "index.html", so gluing the page path onto it would |
| // produce ".../index.html_modules/...", which can never exist. Resolving the |
| // path against the homepage URL replaces that last segment instead. |
| const tryUrl = new URL(currentFilePath, new URL(otherDocsHomepage, location.href)).href; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Populate the version switcher menu from versions.json. |
| (function () { |
| // This widget is rendered more than once on the page with the same ids, and an |
| // id lookup only ever finds the first copy. Pick the menu that sits next to this |
| // very script; document.currentScript is only set while the script is running, |
| // so it must be read here and not inside the asynchronous callback. |
| var ownScript = document.currentScript; |
| var ownMenu = ownScript && ownScript.parentNode |
| ? ownScript.parentNode.querySelector("#version_switcher") |
| : null; |
| var $menu = ownMenu ? $(ownMenu) : $("#version_switcher"); |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $menu.append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <!-- Theme switcher button, emitted with document.write so it only exists when scripts run. It carries one icon span per data-mode value: light, dark and auto. --> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <!-- External project links. Each icon is decorative (hidden from assistive technology); the link's accessible text comes from the visually hidden span, not from a form label element. --> |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github" aria-hidden="true"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box" aria-hidden="true"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <!-- Search button for the "navbar-persistent--mobile" slot, emitted with document.write so it only exists when scripts run. --> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar hide-on-wide"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <!-- Sidebar copy of the top-level site navigation. The landmark is named on the nav element itself; the visible title is exposed as a level-2 heading so it does not compete with the page's h1. --> |
| <div class="navbar-item"><nav class="navbar-nav" aria-label="Site Navigation"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="2"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <!-- Version switcher: a button showing the current docs version plus a menu that the script below fills in. --> |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Build the docs homepage URL for one versions.json entry by substituting |
| // entry.version into the URL template. |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| return template.replace("{version}", entry.version); |
| } |
| |
| // Click handler for a version link: send a HEAD request to find out whether the |
| // current page also exists in the selected docs version and go there if it does; |
| // otherwise go to that version's homepage. Returns false so the browser does not |
| // follow the link itself while the request is in flight. |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/ml/clustering.html", |
| otherDocsHomepage = event.currentTarget.getAttribute("href"); |
| // The homepage URL ends in "index.html", so gluing the page path onto it would |
| // produce ".../index.html_modules/...", which can never exist. Resolving the |
| // path against the homepage URL replaces that last segment instead. |
| const tryUrl = new URL(currentFilePath, new URL(otherDocsHomepage, location.href)).href; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Populate the version switcher menu from versions.json. |
| (function () { |
| // This widget is rendered more than once on the page with the same ids, and an |
| // id lookup only ever finds the first copy. Pick the menu that sits next to this |
| // very script; document.currentScript is only set while the script is running, |
| // so it must be read here and not inside the asynchronous callback. |
| var ownScript = document.currentScript; |
| var ownMenu = ownScript && ownScript.parentNode |
| ? ownScript.parentNode.querySelector("#version_switcher") |
| : null; |
| var $menu = ownMenu ? $(ownMenu) : $("#version_switcher"); |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $menu.append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <!-- Theme switcher button, emitted with document.write so it only exists when scripts run. It carries one icon span per data-mode value: light, dark and auto. --> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <!-- External project links. Each icon is decorative (hidden from assistive technology); the link's accessible text comes from the visually hidden span, not from a form label element. --> |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github" aria-hidden="true"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box" aria-hidden="true"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <!-- Breadcrumb trail. The nav element is the only landmark; the list inside keeps its native list semantics. --> |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../../../index.html" class="nav-link" aria-label="Home"> |
| <!-- Decorative icon; the link is named by its aria-label. --> |
| <i class="fa-solid fa-home" aria-hidden="true"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">pyspark.ml.clustering</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <h1>Source code for pyspark.ml.clustering</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">TYPE_CHECKING</span> |
| |
| <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">since</span><span class="p">,</span> <span class="n">keyword_only</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.param.shared</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">HasMaxIter</span><span class="p">,</span> |
| <span class="n">HasFeaturesCol</span><span class="p">,</span> |
| <span class="n">HasSeed</span><span class="p">,</span> |
| <span class="n">HasPredictionCol</span><span class="p">,</span> |
| <span class="n">HasAggregationDepth</span><span class="p">,</span> |
| <span class="n">HasWeightCol</span><span class="p">,</span> |
| <span class="n">HasTol</span><span class="p">,</span> |
| <span class="n">HasProbabilityCol</span><span class="p">,</span> |
| <span class="n">HasDistanceMeasure</span><span class="p">,</span> |
| <span class="n">HasCheckpointInterval</span><span class="p">,</span> |
| <span class="n">HasSolver</span><span class="p">,</span> |
| <span class="n">HasMaxBlockSizeInMB</span><span class="p">,</span> |
| <span class="n">Param</span><span class="p">,</span> |
| <span class="n">Params</span><span class="p">,</span> |
| <span class="n">TypeConverters</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.util</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">,</span> |
| <span class="n">GeneralJavaMLWritable</span><span class="p">,</span> |
| <span class="n">HasTrainingSummary</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.wrapper</span> <span class="kn">import</span> <span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">JavaModel</span><span class="p">,</span> <span class="n">JavaParams</span><span class="p">,</span> <span class="n">JavaWrapper</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.common</span> <span class="kn">import</span> <span class="n">inherit_doc</span><span class="p">,</span> <span class="n">_java2py</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.stat</span> <span class="kn">import</span> <span class="n">MultivariateGaussian</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vector</span><span class="p">,</span> <span class="n">Matrix</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml._typing</span> <span class="kn">import</span> <span class="n">M</span> |
| <span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span> |
| |
| |
| <span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="s2">"BisectingKMeans"</span><span class="p">,</span> |
| <span class="s2">"BisectingKMeansModel"</span><span class="p">,</span> |
| <span class="s2">"BisectingKMeansSummary"</span><span class="p">,</span> |
| <span class="s2">"KMeans"</span><span class="p">,</span> |
| <span class="s2">"KMeansModel"</span><span class="p">,</span> |
| <span class="s2">"KMeansSummary"</span><span class="p">,</span> |
| <span class="s2">"GaussianMixture"</span><span class="p">,</span> |
| <span class="s2">"GaussianMixtureModel"</span><span class="p">,</span> |
| <span class="s2">"GaussianMixtureSummary"</span><span class="p">,</span> |
| <span class="s2">"LDA"</span><span class="p">,</span> |
| <span class="s2">"LDAModel"</span><span class="p">,</span> |
| <span class="s2">"LocalLDAModel"</span><span class="p">,</span> |
| <span class="s2">"DistributedLDAModel"</span><span class="p">,</span> |
| <span class="s2">"PowerIterationClustering"</span><span class="p">,</span> |
| <span class="p">]</span> |
| |
| |
| <span class="k">class</span> <span class="nc">ClusteringSummary</span><span class="p">(</span><span class="n">JavaWrapper</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Clustering results for a given model.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Name for column of predicted clusters in `predictions`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"predictionCol"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predictions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> DataFrame produced by the model's `transform` method.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"predictions"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">featuresCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Name for column of features in `predictions`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"featuresCol"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">k</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> The number of clusters the model was trained with.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"k"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">cluster</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> DataFrame of predicted cluster centers for each training data point.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"cluster"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">clusterSizes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Size of (number of data points in) each cluster.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"clusterSizes"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">numIter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Number of iterations.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"numIter"</span><span class="p">)</span> |
| |
| |
| <span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">_GaussianMixtureParams</span><span class="p">(</span> |
| <span class="n">HasMaxIter</span><span class="p">,</span> |
| <span class="n">HasFeaturesCol</span><span class="p">,</span> |
| <span class="n">HasSeed</span><span class="p">,</span> |
| <span class="n">HasPredictionCol</span><span class="p">,</span> |
| <span class="n">HasProbabilityCol</span><span class="p">,</span> |
| <span class="n">HasTol</span><span class="p">,</span> |
| <span class="n">HasAggregationDepth</span><span class="p">,</span> |
| <span class="n">HasWeightCol</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`GaussianMixture` and :py:class:`GaussianMixtureModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"k"</span><span class="p">,</span> |
| <span class="s2">"Number of independent Gaussians in the mixture model. "</span> <span class="o">+</span> <span class="s2">"Must be > 1."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_GaussianMixtureParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">tol</span><span class="o">=</span><span class="mf">0.01</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">aggregationDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of `k`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="GaussianMixtureModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel">[docs]</a><span class="k">class</span> <span class="nc">GaussianMixtureModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> |
| <span class="n">_GaussianMixtureParams</span><span class="p">,</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"GaussianMixtureModel"</span><span class="p">],</span> |
| <span class="n">HasTrainingSummary</span><span class="p">[</span><span class="s2">"GaussianMixtureSummary"</span><span class="p">],</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by GaussianMixture.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixtureModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixtureModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`predictionCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.setProbabilityCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.setProbabilityCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setProbabilityCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixtureModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`probabilityCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">probabilityCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">weights</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Weight for each Gaussian distribution in the mixture.</span> |
| <span class="sd"> This is a multinomial probability distribution over the k Gaussians,</span> |
| <span class="sd"> where weights[i] is the weight for Gaussian i, and weights sum to 1.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"weights"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">gaussians</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">MultivariateGaussian</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Array of :py:class:`MultivariateGaussian` where gaussians[i] represents</span> |
| <span class="sd"> the Multivariate Gaussian (Normal) Distribution for Gaussian i.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> |
| <span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| |
| <span class="n">jgaussians</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">gaussians</span><span class="p">()</span> |
| <span class="k">return</span> <span class="p">[</span> |
| <span class="n">MultivariateGaussian</span><span class="p">(</span><span class="n">_java2py</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">jgaussian</span><span class="o">.</span><span class="n">mean</span><span class="p">()),</span> <span class="n">_java2py</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">jgaussian</span><span class="o">.</span><span class="n">cov</span><span class="p">()))</span> |
| <span class="k">for</span> <span class="n">jgaussian</span> <span class="ow">in</span> <span class="n">jgaussians</span> |
| <span class="p">]</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">gaussiansDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Retrieve Gaussian distributions as a DataFrame.</span> |
| <span class="sd"> Each row represents a Gaussian Distribution.</span> |
| <span class="sd"> The DataFrame has two columns: mean (Vector) and cov (Matrix).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"gaussiansDF"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixtureSummary"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets summary (cluster assignments, cluster sizes) of the model trained on the</span> |
| <span class="sd"> training set. An exception is thrown if no summary exists.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">hasSummary</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">GaussianMixtureSummary</span><span class="p">(</span><span class="nb">super</span><span class="p">(</span><span class="n">GaussianMixtureModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span> |
| <span class="s2">"No training summary available for this </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.predict"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.predict">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Predict label for the given features.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"predict"</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.predictProbability"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureModel.html#pyspark.ml.clustering.GaussianMixtureModel.predictProbability">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predictProbability</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Predict probability for the given features.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"predictProbability"</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="GaussianMixture"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">GaussianMixture</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="n">GaussianMixtureModel</span><span class="p">],</span> |
| <span class="n">_GaussianMixtureParams</span><span class="p">,</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"GaussianMixture"</span><span class="p">],</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> GaussianMixture clustering.</span> |
| <span class="sd"> This class performs expectation maximization for multivariate Gaussian</span> |
| <span class="sd"> Mixture Models (GMMs). A GMM represents a composite distribution of</span> |
| <span class="sd"> independent Gaussian distributions with associated "mixing" weights</span> |
| <span class="sd"> specifying each's contribution to the composite.</span> |
| |
| <span class="sd"> Given a set of sample points, this class will maximize the log-likelihood</span> |
| <span class="sd"> for a mixture of k Gaussians, iterating until the log-likelihood changes by</span> |
| <span class="sd"> less than the convergence tolerance (tol), or until it has reached the max number of iterations.</span> |
| <span class="sd"> While this process is generally guaranteed to converge, it is not guaranteed</span> |
| <span class="sd"> to find a global optimum.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> For high-dimensional data (with many features), this algorithm may perform poorly.</span> |
| <span class="sd"> This is due to high-dimensional data (a) making it difficult to cluster at all</span> |
| <span class="sd"> (based on statistical/theoretical arguments) and (b) numerical issues with</span> |
| <span class="sd"> Gaussian distributions.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| |
| <span class="sd"> >>> data = [(Vectors.dense([-0.1, -0.05 ]),),</span> |
| <span class="sd"> ... (Vectors.dense([-0.01, -0.1]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.9, 0.8]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.75, 0.935]),),</span> |
| <span class="sd"> ... (Vectors.dense([-0.83, -0.68]),),</span> |
| <span class="sd"> ... (Vectors.dense([-0.91, -0.76]),)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["features"])</span> |
| <span class="sd"> >>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)</span> |
| <span class="sd"> >>> gm.getMaxIter()</span> |
| <span class="sd"> 100</span> |
| <span class="sd"> >>> gm.setMaxIter(30)</span> |
| <span class="sd"> GaussianMixture...</span> |
| <span class="sd"> >>> gm.getMaxIter()</span> |
| <span class="sd"> 30</span> |
| <span class="sd"> >>> model = gm.fit(df)</span> |
| <span class="sd"> >>> model.getAggregationDepth()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model.getFeaturesCol()</span> |
| <span class="sd"> 'features'</span> |
| <span class="sd"> >>> model.setPredictionCol("newPrediction")</span> |
| <span class="sd"> GaussianMixtureModel...</span> |
| <span class="sd"> >>> model.predict(df.head().features)</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model.predictProbability(df.head().features)</span> |
| <span class="sd"> DenseVector([0.0, 0.0, 1.0])</span> |
| <span class="sd"> >>> model.hasSummary</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> summary = model.summary</span> |
| <span class="sd"> >>> summary.k</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> >>> summary.clusterSizes</span> |
| <span class="sd"> [2, 2, 2]</span> |
| <span class="sd"> >>> weights = model.weights</span> |
| <span class="sd"> >>> len(weights)</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> >>> gaussians = model.gaussians</span> |
| <span class="sd"> >>> len(gaussians)</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> >>> gaussians[0].mean</span> |
| <span class="sd"> DenseVector([0.825, 0.8675])</span> |
| <span class="sd"> >>> gaussians[0].cov</span> |
| <span class="sd"> DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], 0)</span> |
| <span class="sd"> >>> gaussians[1].mean</span> |
| <span class="sd"> DenseVector([-0.87, -0.72])</span> |
| <span class="sd"> >>> gaussians[1].cov</span> |
| <span class="sd"> DenseMatrix(2, 2, [0.0016, 0.0016, 0.0016, 0.0016], 0)</span> |
| <span class="sd"> >>> gaussians[2].mean</span> |
| <span class="sd"> DenseVector([-0.055, -0.075])</span> |
| <span class="sd"> >>> gaussians[2].cov</span> |
| <span class="sd"> DenseMatrix(2, 2, [0.002, -0.0011, -0.0011, 0.0006], 0)</span> |
| <span class="sd"> >>> model.gaussiansDF.select("mean").head()</span> |
| <span class="sd"> Row(mean=DenseVector([0.825, 0.8675]))</span> |
| <span class="sd"> >>> model.gaussiansDF.select("cov").head()</span> |
| <span class="sd"> Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))</span> |
| <span class="sd"> >>> transformed = model.transform(df).select("features", "newPrediction")</span> |
| <span class="sd"> >>> rows = transformed.collect()</span> |
| <span class="sd"> >>> rows[4].newPrediction == rows[5].newPrediction</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> rows[2].newPrediction == rows[3].newPrediction</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> gmm_path = temp_path + "/gmm"</span> |
| <span class="sd"> >>> gm.save(gmm_path)</span> |
| <span class="sd"> >>> gm2 = GaussianMixture.load(gmm_path)</span> |
| <span class="sd"> >>> gm2.getK()</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> >>> model_path = temp_path + "/gmm_model"</span> |
| <span class="sd"> >>> model.save(model_path)</span> |
| <span class="sd"> >>> model2 = GaussianMixtureModel.load(model_path)</span> |
| <span class="sd"> >>> model2.hasSummary</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> model2.weights == model.weights</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model2.gaussians[0].mean == model.gaussians[0].mean</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model2.gaussians[0].cov == model.gaussians[0].cov</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model2.gaussians[1].mean == model.gaussians[1].mean</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model2.gaussians[1].cov == model.gaussians[1].cov</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model2.gaussians[2].mean == model.gaussians[2].mean</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model2.gaussians[2].cov == model.gaussians[2].cov</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model2.gaussiansDF.select("mean").head()</span> |
| <span class="sd"> Row(mean=DenseVector([0.825, 0.8675]))</span> |
| <span class="sd"> >>> model2.gaussiansDF.select("cov").head()</span> |
| <span class="sd"> Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))</span> |
| <span class="sd"> >>> model.transform(df).take(1) == model2.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> gm2.setWeightCol("weight")</span> |
| <span class="sd"> GaussianMixture...</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"prediction"</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">probabilityCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"probability"</span><span class="p">,</span> |
| <span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.01</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">aggregationDepth</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, featuresCol="features", predictionCol="prediction", k=2, \</span> |
| <span class="sd"> probabilityCol="probability", tol=0.01, maxIter=100, seed=None, \</span> |
| <span class="sd"> aggregationDepth=2, weightCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">GaussianMixture</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.clustering.GaussianMixture"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixtureModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">GaussianMixtureModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="GaussianMixture.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"prediction"</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">probabilityCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"probability"</span><span class="p">,</span> |
| <span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.01</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">aggregationDepth</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, featuresCol="features", predictionCol="prediction", k=2, \</span> |
| <span class="sd"> probabilityCol="probability", tol=0.01, maxIter=100, seed=None, \</span> |
| <span class="sd"> aggregationDepth=2, weightCol=None)</span> |
| |
| <span class="sd"> Sets params for GaussianMixture.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`k`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxIter`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`predictionCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setProbabilityCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setProbabilityCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setProbabilityCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`probabilityCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">probabilityCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`weightCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setTol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setTol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setTol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`tol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">tol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixture.setAggregationDepth"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixture.html#pyspark.ml.clustering.GaussianMixture.setAggregationDepth">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setAggregationDepth</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixture"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`aggregationDepth`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">aggregationDepth</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="GaussianMixtureSummary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.GaussianMixtureSummary.html#pyspark.ml.clustering.GaussianMixtureSummary">[docs]</a><span class="k">class</span> <span class="nc">GaussianMixtureSummary</span><span class="p">(</span><span class="n">ClusteringSummary</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gaussian mixture clustering results for a given model.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">probabilityCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Name for column of predicted probability of each cluster in `predictions`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"probabilityCol"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">probability</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> DataFrame of probabilities of each cluster for each training data point.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"probability"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">logLikelihood</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Total log-likelihood for this model on the given data.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"logLikelihood"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="KMeansSummary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansSummary.html#pyspark.ml.clustering.KMeansSummary">[docs]</a><span class="k">class</span> <span class="nc">KMeansSummary</span><span class="p">(</span><span class="n">ClusteringSummary</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Summary of KMeans.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">trainingCost</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> K-means cost (sum of squared distances to the nearest centroid for all points in the</span> |
| <span class="sd"> training dataset). This is equivalent to sklearn's inertia.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"trainingCost"</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">_KMeansParams</span><span class="p">(</span> |
| <span class="n">HasMaxIter</span><span class="p">,</span> |
| <span class="n">HasFeaturesCol</span><span class="p">,</span> |
| <span class="n">HasSeed</span><span class="p">,</span> |
| <span class="n">HasPredictionCol</span><span class="p">,</span> |
| <span class="n">HasTol</span><span class="p">,</span> |
| <span class="n">HasDistanceMeasure</span><span class="p">,</span> |
| <span class="n">HasWeightCol</span><span class="p">,</span> |
| <span class="n">HasSolver</span><span class="p">,</span> |
| <span class="n">HasMaxBlockSizeInMB</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`KMeans` and :py:class:`KMeansModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"k"</span><span class="p">,</span> |
| <span class="s2">"The number of clusters to create. Must be > 1."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">initMode</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"initMode"</span><span class="p">,</span> |
| <span class="s1">'The initialization algorithm. This can be either "random" to '</span> |
| <span class="o">+</span> <span class="s1">'choose random points as initial cluster centers, or "k-means||" '</span> |
| <span class="o">+</span> <span class="s2">"to use a parallel variant of k-means++"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">initSteps</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"initSteps"</span><span class="p">,</span> |
| <span class="s2">"The number of steps for k-means|| "</span> <span class="o">+</span> <span class="s2">"initialization mode. Must be > 0."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">solver</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"solver"</span><span class="p">,</span> |
| <span class="s2">"The solver algorithm for optimization. Supported "</span> <span class="o">+</span> <span class="s2">"options: auto, row, block."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_KMeansParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span> |
| <span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> |
| <span class="n">initMode</span><span class="o">=</span><span class="s2">"k-means||"</span><span class="p">,</span> |
| <span class="n">initSteps</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> |
| <span class="n">tol</span><span class="o">=</span><span class="mf">1e-4</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> |
| <span class="n">distanceMeasure</span><span class="o">=</span><span class="s2">"euclidean"</span><span class="p">,</span> |
| <span class="n">solver</span><span class="o">=</span><span class="s2">"auto"</span><span class="p">,</span> |
| <span class="n">maxBlockSizeInMB</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of `k`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of `initMode`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">initMode</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getInitSteps</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of `initSteps`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">initSteps</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="KMeansModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel">[docs]</a><span class="k">class</span> <span class="nc">KMeansModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> |
| <span class="n">_KMeansParams</span><span class="p">,</span> |
| <span class="n">GeneralJavaMLWritable</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"KMeansModel"</span><span class="p">],</span> |
| <span class="n">HasTrainingSummary</span><span class="p">[</span><span class="s2">"KMeansSummary"</span><span class="p">],</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by KMeans.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="KMeansModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeansModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeansModel.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeansModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`predictionCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeansModel.clusterCenters"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.clusterCenters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">clusterCenters</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Get the cluster centers, represented as a list of NumPy arrays."""</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">c</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"clusterCenters"</span><span class="p">)]</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">KMeansSummary</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets summary (cluster assignments, cluster sizes) of the model trained on the</span> |
| <span class="sd"> training set. An exception is thrown if no summary exists.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">hasSummary</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">KMeansSummary</span><span class="p">(</span><span class="nb">super</span><span class="p">(</span><span class="n">KMeansModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span> |
| <span class="s2">"No training summary available for this </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="KMeansModel.predict"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeansModel.html#pyspark.ml.clustering.KMeansModel.predict">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Predict label for the given features.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"predict"</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="KMeans"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">KMeans</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">KMeansModel</span><span class="p">],</span> <span class="n">_KMeansParams</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"KMeans"</span><span class="p">]):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> K-means clustering with a k-means++ like initialization mode</span> |
| <span class="sd"> (the k-means|| algorithm by Bahmani et al.).</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),</span> |
| <span class="sd"> ... (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["features", "weighCol"])</span> |
| <span class="sd"> >>> kmeans = KMeans(k=2)</span> |
| <span class="sd"> >>> kmeans.setSeed(1)</span> |
| <span class="sd"> KMeans...</span> |
| <span class="sd"> >>> kmeans.setWeightCol("weighCol")</span> |
| <span class="sd"> KMeans...</span> |
| <span class="sd"> >>> kmeans.setMaxIter(10)</span> |
| <span class="sd"> KMeans...</span> |
| <span class="sd"> >>> kmeans.getMaxIter()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> kmeans.clear(kmeans.maxIter)</span> |
| <span class="sd"> >>> kmeans.getSolver()</span> |
| <span class="sd"> 'auto'</span> |
| <span class="sd"> >>> model = kmeans.fit(df)</span> |
| <span class="sd"> >>> model.getMaxBlockSizeInMB()</span> |
| <span class="sd"> 0.0</span> |
| <span class="sd"> >>> model.getDistanceMeasure()</span> |
| <span class="sd"> 'euclidean'</span> |
| <span class="sd"> >>> model.setPredictionCol("newPrediction")</span> |
| <span class="sd"> KMeansModel...</span> |
| <span class="sd"> >>> model.predict(df.head().features)</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> >>> centers = model.clusterCenters()</span> |
| <span class="sd"> >>> len(centers)</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> transformed = model.transform(df).select("features", "newPrediction")</span> |
| <span class="sd"> >>> rows = transformed.collect()</span> |
| <span class="sd"> >>> rows[0].newPrediction == rows[1].newPrediction</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> rows[2].newPrediction == rows[3].newPrediction</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model.hasSummary</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> summary = model.summary</span> |
| <span class="sd"> >>> summary.k</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> summary.clusterSizes</span> |
| <span class="sd"> [2, 2]</span> |
| <span class="sd"> >>> summary.trainingCost</span> |
| <span class="sd"> 4.0</span> |
| <span class="sd"> >>> kmeans_path = temp_path + "/kmeans"</span> |
| <span class="sd"> >>> kmeans.save(kmeans_path)</span> |
| <span class="sd"> >>> kmeans2 = KMeans.load(kmeans_path)</span> |
| <span class="sd"> >>> kmeans2.getK()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model_path = temp_path + "/kmeans_model"</span> |
| <span class="sd"> >>> model.save(model_path)</span> |
| <span class="sd"> >>> model2 = KMeansModel.load(model_path)</span> |
| <span class="sd"> >>> model2.hasSummary</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> model.clusterCenters()[0] == model2.clusterCenters()[0]</span> |
| <span class="sd"> array([ True, True], dtype=bool)</span> |
| <span class="sd"> >>> model.clusterCenters()[1] == model2.clusterCenters()[1]</span> |
| <span class="sd"> array([ True, True], dtype=bool)</span> |
| <span class="sd"> >>> model.transform(df).take(1) == model2.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"prediction"</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"k-means||"</span><span class="p">,</span> |
| <span class="n">initSteps</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-4</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"euclidean"</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">solver</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"auto"</span><span class="p">,</span> |
| <span class="n">maxBlockSizeInMB</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, featuresCol="features", predictionCol="prediction", k=2, \</span> |
| <span class="sd"> initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \</span> |
| <span class="sd"> distanceMeasure="euclidean", weightCol=None, solver="auto", \</span> |
| <span class="sd"> maxBlockSizeInMB=0.0)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">KMeans</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.clustering.KMeans"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="n">KMeansModel</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">KMeansModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="KMeans.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"prediction"</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"k-means||"</span><span class="p">,</span> |
| <span class="n">initSteps</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">tol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-4</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"euclidean"</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">solver</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"auto"</span><span class="p">,</span> |
| <span class="n">maxBlockSizeInMB</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, featuresCol="features", predictionCol="prediction", k=2, \</span> |
| <span class="sd"> initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \</span> |
| <span class="sd"> distanceMeasure="euclidean", weightCol=None, solver="auto", \</span> |
| <span class="sd"> maxBlockSizeInMB=0.0)</span> |
| |
| <span class="sd"> Sets params for KMeans.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`k`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setInitMode"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setInitMode">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`initMode`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">initMode</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setInitSteps"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setInitSteps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInitSteps</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`initSteps`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">initSteps</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setDistanceMeasure"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setDistanceMeasure">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDistanceMeasure</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`distanceMeasure`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">distanceMeasure</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxIter`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`predictionCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setTol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setTol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setTol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`tol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">tol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`weightCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setSolver"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setSolver">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSolver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`solver`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">solver</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="KMeans.setMaxBlockSizeInMB"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans.setMaxBlockSizeInMB">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxBlockSizeInMB</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"KMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxBlockSizeInMB`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxBlockSizeInMB</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">_BisectingKMeansParams</span><span class="p">(</span> |
| <span class="n">HasMaxIter</span><span class="p">,</span> |
| <span class="n">HasFeaturesCol</span><span class="p">,</span> |
| <span class="n">HasSeed</span><span class="p">,</span> |
| <span class="n">HasPredictionCol</span><span class="p">,</span> |
| <span class="n">HasDistanceMeasure</span><span class="p">,</span> |
| <span class="n">HasWeightCol</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`BisectingKMeans` and :py:class:`BisectingKMeansModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"k"</span><span class="p">,</span> |
| <span class="s2">"The desired number of leaf clusters. Must be > 1."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">minDivisibleClusterSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"minDivisibleClusterSize"</span><span class="p">,</span> |
| <span class="s2">"The minimum number of points (if >= 1.0) or the minimum "</span> |
| <span class="o">+</span> <span class="s2">"proportion of points (if &lt; 1.0) of a divisible cluster."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_BisectingKMeansParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">minDivisibleClusterSize</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of `k` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMinDivisibleClusterSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of `minDivisibleClusterSize` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDivisibleClusterSize</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="BisectingKMeansModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel">[docs]</a><span class="k">class</span> <span class="nc">BisectingKMeansModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> |
| <span class="n">_BisectingKMeansParams</span><span class="p">,</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"BisectingKMeansModel"</span><span class="p">],</span> |
| <span class="n">HasTrainingSummary</span><span class="p">[</span><span class="s2">"BisectingKMeansSummary"</span><span class="p">],</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by BisectingKMeans.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="BisectingKMeansModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeansModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeansModel.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeansModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`predictionCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeansModel.clusterCenters"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.clusterCenters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">clusterCenters</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Get the cluster centers, represented as a list of NumPy arrays."""</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">c</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"clusterCenters"</span><span class="p">)]</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeansModel.computeCost"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.computeCost">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">computeCost</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the sum of squared distances between the input points</span> |
| <span class="sd"> and their corresponding cluster centers.</span> |
| |
| <span class="sd"> .. deprecated:: 3.0.0</span> |
| <span class="sd"> It will be removed in future versions. Use :py:class:`ClusteringEvaluator` instead.</span> |
| <span class="sd"> You can also get the cost on the training dataset in the summary.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"Deprecated in 3.0.0. It will be removed in future versions. Use "</span> |
| <span class="s2">"ClusteringEvaluator instead. You can also get the cost on the training "</span> |
| <span class="s2">"dataset in the summary."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"computeCost"</span><span class="p">,</span> <span class="n">dataset</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeansSummary"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets summary (cluster assignments, cluster sizes) of the model trained on the</span> |
| <span class="sd"> training set. An exception is thrown if no summary exists.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">hasSummary</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">BisectingKMeansSummary</span><span class="p">(</span><span class="nb">super</span><span class="p">(</span><span class="n">BisectingKMeansModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">summary</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span> |
| <span class="s2">"No training summary available for this </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="BisectingKMeansModel.predict"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansModel.html#pyspark.ml.clustering.BisectingKMeansModel.predict">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Predict label for the given features.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"predict"</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="BisectingKMeans"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">BisectingKMeans</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="n">BisectingKMeansModel</span><span class="p">],</span> |
| <span class="n">_BisectingKMeansParams</span><span class="p">,</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"BisectingKMeans"</span><span class="p">],</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A bisecting k-means algorithm based on the paper "A comparison of document clustering</span> |
| <span class="sd"> techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark.</span> |
| <span class="sd"> The algorithm starts from a single cluster that contains all points.</span> |
| <span class="sd"> Iteratively it finds divisible clusters on the bottom level and bisects each of them using</span> |
| <span class="sd"> k-means, until there are `k` leaf clusters in total or no leaf clusters are divisible.</span> |
| <span class="sd"> The bisecting steps of clusters on the same level are grouped together to increase parallelism.</span> |
| <span class="sd"> If bisecting all divisible clusters on the bottom level would result in more than `k` leaf</span> |
| <span class="sd"> clusters, larger clusters get higher priority.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),</span> |
| <span class="sd"> ... (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["features", "weighCol"])</span> |
| <span class="sd"> >>> bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)</span> |
| <span class="sd"> >>> bkm.setMaxIter(10)</span> |
| <span class="sd"> BisectingKMeans...</span> |
| <span class="sd"> >>> bkm.getMaxIter()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> bkm.clear(bkm.maxIter)</span> |
| <span class="sd"> >>> bkm.setSeed(1)</span> |
| <span class="sd"> BisectingKMeans...</span> |
| <span class="sd"> >>> bkm.setWeightCol("weighCol")</span> |
| <span class="sd"> BisectingKMeans...</span> |
| <span class="sd"> >>> bkm.getSeed()</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> >>> bkm.clear(bkm.seed)</span> |
| <span class="sd"> >>> model = bkm.fit(df)</span> |
| <span class="sd"> >>> model.getMaxIter()</span> |
| <span class="sd"> 20</span> |
| <span class="sd"> >>> model.setPredictionCol("newPrediction")</span> |
| <span class="sd"> BisectingKMeansModel...</span> |
| <span class="sd"> >>> model.predict(df.head().features)</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> >>> centers = model.clusterCenters()</span> |
| <span class="sd"> >>> len(centers)</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model.computeCost(df)</span> |
| <span class="sd"> 2.0</span> |
| <span class="sd"> >>> model.hasSummary</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> summary = model.summary</span> |
| <span class="sd"> >>> summary.k</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> summary.clusterSizes</span> |
| <span class="sd"> [2, 2]</span> |
| <span class="sd"> >>> summary.trainingCost</span> |
| <span class="sd"> 4.000...</span> |
| <span class="sd"> >>> transformed = model.transform(df).select("features", "newPrediction")</span> |
| <span class="sd"> >>> rows = transformed.collect()</span> |
| <span class="sd"> >>> rows[0].newPrediction == rows[1].newPrediction</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> rows[2].newPrediction == rows[3].newPrediction</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> bkm_path = temp_path + "/bkm"</span> |
| <span class="sd"> >>> bkm.save(bkm_path)</span> |
| <span class="sd"> >>> bkm2 = BisectingKMeans.load(bkm_path)</span> |
| <span class="sd"> >>> bkm2.getK()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> bkm2.getDistanceMeasure()</span> |
| <span class="sd"> 'euclidean'</span> |
| <span class="sd"> >>> model_path = temp_path + "/bkm_model"</span> |
| <span class="sd"> >>> model.save(model_path)</span> |
| <span class="sd"> >>> model2 = BisectingKMeansModel.load(model_path)</span> |
| <span class="sd"> >>> model2.hasSummary</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> model.clusterCenters()[0] == model2.clusterCenters()[0]</span> |
| <span class="sd"> array([ True, True], dtype=bool)</span> |
| <span class="sd"> >>> model.clusterCenters()[1] == model2.clusterCenters()[1]</span> |
| <span class="sd"> array([ True, True], dtype=bool)</span> |
| <span class="sd"> >>> model.transform(df).take(1) == model2.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"prediction"</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span><span class="p">,</span> |
| <span class="n">minDivisibleClusterSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"euclidean"</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, featuresCol="features", predictionCol="prediction", maxIter=20, \</span> |
| <span class="sd"> seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean", \</span> |
| <span class="sd"> weightCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">BisectingKMeans</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.clustering.BisectingKMeans"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">predictionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"prediction"</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span><span class="p">,</span> |
| <span class="n">minDivisibleClusterSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">distanceMeasure</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"euclidean"</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, featuresCol="features", predictionCol="prediction", maxIter=20, \</span> |
| <span class="sd"> seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean", \</span> |
| <span class="sd"> weightCol=None)</span> |
| <span class="sd"> Sets params for BisectingKMeans.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`k`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setMinDivisibleClusterSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setMinDivisibleClusterSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMinDivisibleClusterSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`minDivisibleClusterSize`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDivisibleClusterSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setDistanceMeasure"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setDistanceMeasure">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDistanceMeasure</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`distanceMeasure`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">distanceMeasure</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxIter`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setPredictionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setPredictionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPredictionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`predictionCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BisectingKMeans.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeans.html#pyspark.ml.clustering.BisectingKMeans.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BisectingKMeans"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`weightCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="n">BisectingKMeansModel</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">BisectingKMeansModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="BisectingKMeansSummary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.BisectingKMeansSummary.html#pyspark.ml.clustering.BisectingKMeansSummary">[docs]</a><span class="k">class</span> <span class="nc">BisectingKMeansSummary</span><span class="p">(</span><span class="n">ClusteringSummary</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Bisecting KMeans clustering results for a given model.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">trainingCost</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sum of squared distances to the nearest centroid for all points in the training dataset.</span> |
| <span class="sd"> This is equivalent to sklearn's inertia.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"trainingCost"</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">_LDAParams</span><span class="p">(</span><span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasSeed</span><span class="p">,</span> <span class="n">HasCheckpointInterval</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`LDA` and :py:class:`LDAModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"k"</span><span class="p">,</span> |
| <span class="s2">"The number of topics (clusters) to infer. Must be > 1."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">optimizer</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"optimizer"</span><span class="p">,</span> |
| <span class="s2">"Optimizer or inference algorithm used to estimate the LDA model. "</span> |
| <span class="s2">"Supported: online, em"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">learningOffset</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"learningOffset"</span><span class="p">,</span> |
| <span class="s2">"A (positive) learning parameter that downweights early iterations."</span> |
| <span class="s2">" Larger values make early iterations count less"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">learningDecay</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"learningDecay"</span><span class="p">,</span> |
| <span class="s2">"Learning rate, set as an "</span> |
| <span class="s2">"exponential decay rate. This should be between (0.5, 1.0] to "</span> |
| <span class="s2">"guarantee asymptotic convergence."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">subsamplingRate</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"subsamplingRate"</span><span class="p">,</span> |
| <span class="s2">"Fraction of the corpus to be sampled and used in each iteration "</span> |
| <span class="s2">"of mini-batch gradient descent, in range (0, 1]."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">optimizeDocConcentration</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"optimizeDocConcentration"</span><span class="p">,</span> |
| <span class="s2">"Indicates whether the docConcentration (Dirichlet parameter "</span> |
| <span class="s2">"for document-topic distribution) will be optimized during "</span> |
| <span class="s2">"training."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">docConcentration</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"docConcentration"</span><span class="p">,</span> |
| <span class="s1">'Concentration parameter (commonly named "alpha") for the '</span> |
| <span class="s1">'prior placed on documents</span><span class="se">\'</span><span class="s1"> distributions over topics ("theta").'</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">topicConcentration</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"topicConcentration"</span><span class="p">,</span> |
| <span class="s1">'Concentration parameter (commonly named "beta" or "eta") for '</span> |
| <span class="s2">"the prior placed on topics' distributions over terms."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">topicDistributionCol</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"topicDistributionCol"</span><span class="p">,</span> |
| <span class="s2">"Output column with estimates of the topic mixture distribution "</span> |
| <span class="s1">'for each document (often called "theta" in the literature). '</span> |
| <span class="s2">"Returns a vector of zeros for an empty document."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">keepLastCheckpoint</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"keepLastCheckpoint"</span><span class="p">,</span> |
| <span class="s2">"(For EM optimizer) If using checkpointing, this indicates whether"</span> |
| <span class="s2">" to keep the last checkpoint. If false, then the checkpoint will be"</span> |
| <span class="s2">" deleted. Deleting the checkpoint can cause failures if a data"</span> |
| <span class="s2">" partition is lost, so set this bit with care."</span><span class="p">,</span> |
| <span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_LDAParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span> |
| <span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> |
| <span class="n">checkpointInterval</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> |
| <span class="n">k</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> |
| <span class="n">optimizer</span><span class="o">=</span><span class="s2">"online"</span><span class="p">,</span> |
| <span class="n">learningOffset</span><span class="o">=</span><span class="mf">1024.0</span><span class="p">,</span> |
| <span class="n">learningDecay</span><span class="o">=</span><span class="mf">0.51</span><span class="p">,</span> |
| <span class="n">subsamplingRate</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">optimizeDocConcentration</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">topicDistributionCol</span><span class="o">=</span><span class="s2">"topicDistribution"</span><span class="p">,</span> |
| <span class="n">keepLastCheckpoint</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`k` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getOptimizer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`optimizer` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getLearningOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`learningOffset` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">learningOffset</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getLearningDecay</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`learningDecay` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">learningDecay</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSubsamplingRate</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`subsamplingRate` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">subsamplingRate</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getOptimizeDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`optimizeDocConcentration` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">optimizeDocConcentration</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`docConcentration` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">docConcentration</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getTopicConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`topicConcentration` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">topicConcentration</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getTopicDistributionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`topicDistributionCol` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">topicDistributionCol</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getKeepLastCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`keepLastCheckpoint` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">keepLastCheckpoint</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="LDAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">LDAModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_LDAParams</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Latent Dirichlet Allocation (LDA) model.</span> |
| <span class="sd"> This abstraction allows for different underlying representations,</span> |
| <span class="sd"> including local and distributed data structures.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="LDAModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"M"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"M"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"M"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"M"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.setTopicDistributionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.setTopicDistributionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setTopicDistributionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"M"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"M"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`topicDistributionCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">topicDistributionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.isDistributed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.isDistributed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">isDistributed</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Indicates whether this instance is of type :py:class:`DistributedLDAModel`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"isDistributed"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.vocabSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.vocabSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">vocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Vocabulary size (number of terms or words in the vocabulary)"""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"vocabSize"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.topicsMatrix"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.topicsMatrix">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">topicsMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Matrix</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Inferred topics, where each topic is represented by a distribution over terms.</span> |
| <span class="sd"> This is a matrix of size vocabSize x k, where each column is a topic.</span> |
| <span class="sd"> No guarantees are given about the ordering of the topics.</span> |
| |
| <span class="sd"> .. warning:: If this model is actually a :py:class:`DistributedLDAModel`</span> |
| <span class="sd"> instance produced by the Expectation-Maximization ("em") `optimizer`,</span> |
| <span class="sd"> then this method could involve collecting a large amount of data</span> |
| <span class="sd"> to the driver (on the order of vocabSize x k).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"topicsMatrix"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.logLikelihood"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.logLikelihood">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">logLikelihood</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculates a lower bound on the log likelihood of the entire corpus.</span> |
| <span class="sd"> See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</span> |
| |
| <span class="sd"> .. warning:: If this model is an instance of :py:class:`DistributedLDAModel` (produced when</span> |
| <span class="sd"> :py:attr:`optimizer` is set to "em"), this involves collecting a large</span> |
| <span class="sd"> :py:func:`topicsMatrix` to the driver. This implementation may be changed in the future.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"logLikelihood"</span><span class="p">,</span> <span class="n">dataset</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.logPerplexity"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.logPerplexity">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">logPerplexity</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculates an upper bound on perplexity. (Lower is better.)</span> |
| <span class="sd"> See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</span> |
| |
| <span class="sd"> .. warning:: If this model is an instance of :py:class:`DistributedLDAModel` (produced when</span> |
| <span class="sd"> :py:attr:`optimizer` is set to "em"), this involves collecting a large</span> |
| <span class="sd"> :py:func:`topicsMatrix` to the driver. This implementation may be changed in the future.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"logPerplexity"</span><span class="p">,</span> <span class="n">dataset</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.describeTopics"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.describeTopics">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">describeTopics</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">maxTermsPerTopic</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the topics described by their top-weighted terms.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"describeTopics"</span><span class="p">,</span> <span class="n">maxTermsPerTopic</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.estimatedDocConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDAModel.html#pyspark.ml.clustering.LDAModel.estimatedDocConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">estimatedDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Value for :py:attr:`LDA.docConcentration` estimated from data.</span> |
| <span class="sd"> If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false,</span> |
| <span class="sd"> then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration` parameter.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"estimatedDocConcentration"</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="DistributedLDAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">DistributedLDAModel</span><span class="p">(</span><span class="n">LDAModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"DistributedLDAModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Distributed model fitted by :py:class:`LDA`.</span> |
| <span class="sd"> This type of model is currently only produced by Expectation-Maximization (EM).</span> |
| |
| <span class="sd"> This model stores the inferred topics, the full training dataset, and the topic distribution</span> |
| <span class="sd"> for each training document.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="DistributedLDAModel.toLocal"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.toLocal">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">toLocal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LocalLDAModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert this distributed model to a local representation. This discards info about the</span> |
| <span class="sd"> training dataset.</span> |
| |
| <span class="sd"> .. warning:: This involves collecting a large :py:func:`topicsMatrix` to the driver.</span> |
| <span class="sd"> """</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">LocalLDAModel</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"toLocal"</span><span class="p">))</span> |
| |
| <span class="c1"># SPARK-10931: Temporary fix to be removed once LDAModel defines Params</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">_create_params_from_java</span><span class="p">()</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">_transfer_params_from_java</span><span class="p">()</span> |
| |
| <span class="k">return</span> <span class="n">model</span></div> |
| |
| <div class="viewcode-block" id="DistributedLDAModel.trainingLogLikelihood"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.trainingLogLikelihood">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">trainingLogLikelihood</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Log likelihood of the observed tokens in the training set,</span> |
| <span class="sd"> given the current parameter estimates:</span> |
| <span class="sd"> log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> - This excludes the prior; for that, use :py:func:`logPrior`.</span> |
| <span class="sd"> - Even with :py:func:`logPrior`, this is NOT the same as the data log likelihood given</span> |
| <span class="sd"> the hyperparameters.</span> |
| <span class="sd"> - This is computed from the topic distributions computed during training. If you call</span> |
| <span class="sd"> :py:func:`logLikelihood` on the same training dataset, the topic distributions</span> |
| <span class="sd"> will be computed again, possibly giving different results.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"trainingLogLikelihood"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DistributedLDAModel.logPrior"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.logPrior">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">logPrior</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Log probability of the current parameter estimate:</span> |
| <span class="sd"> log P(topics, topic distributions for docs | alpha, eta)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"logPrior"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DistributedLDAModel.getCheckpointFiles"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.DistributedLDAModel.html#pyspark.ml.clustering.DistributedLDAModel.getCheckpointFiles">[docs]</a> <span class="k">def</span> <span class="nf">getCheckpointFiles</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may</span> |
| <span class="sd"> be saved checkpoint files. This method is provided so that users can manage those files.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> List of checkpoint files from training</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Removing the checkpoints can cause failures if a partition is lost and is needed</span> |
| <span class="sd"> by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up</span> |
| <span class="sd"> the checkpoints when this model and derivative data go out of scope.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"getCheckpointFiles"</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="LocalLDAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LocalLDAModel.html#pyspark.ml.clustering.LocalLDAModel">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">LocalLDAModel</span><span class="p">(</span><span class="n">LDAModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"LocalLDAModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Local (non-distributed) model fitted by :py:class:`LDA`.</span> |
| <span class="sd"> This model stores the inferred topics only; it does not store info about the training dataset.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">pass</span></div> |
| |
| |
| <div class="viewcode-block" id="LDA"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">LDA</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">LDAModel</span><span class="p">],</span> <span class="n">_LDAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"LDA"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Latent Dirichlet Allocation (LDA), a topic model designed for text documents.</span> |
| |
| <span class="sd"> Terminology:</span> |
| |
| <span class="sd"> - "term" = "word": an element of the vocabulary</span> |
| <span class="sd"> - "token": instance of a term appearing in a document</span> |
| <span class="sd"> - "topic": multinomial distribution over terms representing some concept</span> |
| <span class="sd"> - "document": one piece of text, corresponding to one row in the input data</span> |
| |
| <span class="sd"> Original LDA paper (journal version):</span> |
| <span class="sd"> Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.</span> |
| |
| <span class="sd"> Input data (featuresCol):</span> |
| <span class="sd"> LDA is given a collection of documents as input data, via the featuresCol parameter.</span> |
| <span class="sd"> Each document is specified as a :py:class:`Vector` of length vocabSize, where each entry is the</span> |
| <span class="sd"> count for the corresponding term (word) in the document. Feature transformers such as</span> |
| <span class="sd"> :py:class:`pyspark.ml.feature.Tokenizer` and :py:class:`pyspark.ml.feature.CountVectorizer`</span> |
| <span class="sd"> can be useful for converting text to word count vectors.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors, SparseVector</span> |
| <span class="sd"> >>> from pyspark.ml.clustering import LDA</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1, Vectors.dense([0.0, 1.0])],</span> |
| <span class="sd"> ... [2, SparseVector(2, {0: 1.0})],], ["id", "features"])</span> |
| <span class="sd"> >>> lda = LDA(k=2, seed=1, optimizer="em")</span> |
| <span class="sd"> >>> lda.setMaxIter(10)</span> |
| <span class="sd"> LDA...</span> |
| <span class="sd"> >>> lda.getMaxIter()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> lda.clear(lda.maxIter)</span> |
| <span class="sd"> >>> model = lda.fit(df)</span> |
| <span class="sd"> >>> model.setSeed(1)</span> |
| <span class="sd"> DistributedLDAModel...</span> |
| <span class="sd"> >>> model.getTopicDistributionCol()</span> |
| <span class="sd"> 'topicDistribution'</span> |
| <span class="sd"> >>> model.isDistributed()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> localModel = model.toLocal()</span> |
| <span class="sd"> >>> localModel.isDistributed()</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> model.vocabSize()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model.describeTopics().show()</span> |
| <span class="sd"> +-----+-----------+--------------------+</span> |
| <span class="sd"> |topic|termIndices| termWeights|</span> |
| <span class="sd"> +-----+-----------+--------------------+</span> |
| <span class="sd"> | 0| [1, 0]|[0.50401530077160...|</span> |
| <span class="sd"> | 1| [0, 1]|[0.50401530077160...|</span> |
| <span class="sd"> +-----+-----------+--------------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> model.topicsMatrix()</span> |
| <span class="sd"> DenseMatrix(2, 2, [0.496, 0.504, 0.504, 0.496], 0)</span> |
| <span class="sd"> >>> lda_path = temp_path + "/lda"</span> |
| <span class="sd"> >>> lda.save(lda_path)</span> |
| <span class="sd"> >>> sameLDA = LDA.load(lda_path)</span> |
| <span class="sd"> >>> distributed_model_path = temp_path + "/lda_distributed_model"</span> |
| <span class="sd"> >>> model.save(distributed_model_path)</span> |
| <span class="sd"> >>> sameModel = DistributedLDAModel.load(distributed_model_path)</span> |
| <span class="sd"> >>> local_model_path = temp_path + "/lda_local_model"</span> |
| <span class="sd"> >>> localModel.save(local_model_path)</span> |
| <span class="sd"> >>> sameLocalModel = LocalLDAModel.load(local_model_path)</span> |
| <span class="sd"> >>> model.transform(df).take(1) == sameLocalModel.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">checkpointInterval</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span> |
| <span class="n">optimizer</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"online"</span><span class="p">,</span> |
| <span class="n">learningOffset</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1024.0</span><span class="p">,</span> |
| <span class="n">learningDecay</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.51</span><span class="p">,</span> |
| <span class="n">subsamplingRate</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">optimizeDocConcentration</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">docConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">topicConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">topicDistributionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"topicDistribution"</span><span class="p">,</span> |
| <span class="n">keepLastCheckpoint</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10,\</span> |
| <span class="sd"> k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\</span> |
| <span class="sd"> subsamplingRate=0.05, optimizeDocConcentration=True,\</span> |
| <span class="sd"> docConcentration=None, topicConcentration=None,\</span> |
| <span class="sd"> topicDistributionCol="topicDistribution", keepLastCheckpoint=True)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">LDA</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.clustering.LDA"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="n">LDAModel</span><span class="p">:</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOptimizer</span><span class="p">()</span> <span class="o">==</span> <span class="s2">"em"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DistributedLDAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">LocalLDAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="LDA.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">checkpointInterval</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span> |
| <span class="n">optimizer</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"online"</span><span class="p">,</span> |
| <span class="n">learningOffset</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1024.0</span><span class="p">,</span> |
| <span class="n">learningDecay</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.51</span><span class="p">,</span> |
| <span class="n">subsamplingRate</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">optimizeDocConcentration</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">docConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">topicConcentration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">topicDistributionCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"topicDistribution"</span><span class="p">,</span> |
| <span class="n">keepLastCheckpoint</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10,\</span> |
| <span class="sd"> k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\</span> |
| <span class="sd"> subsamplingRate=0.05, optimizeDocConcentration=True,\</span> |
| <span class="sd"> docConcentration=None, topicConcentration=None,\</span> |
| <span class="sd"> topicDistributionCol="topicDistribution", keepLastCheckpoint=True)</span> |
| |
| <span class="sd"> Sets params for LDA.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setCheckpointInterval"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setCheckpointInterval">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setCheckpointInterval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`checkpointInterval`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">checkpointInterval</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setSeed">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`k`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setK(10)</span> |
| <span class="sd"> >>> algo.getK()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setOptimizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setOptimizer">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOptimizer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`optimizer`.</span> |
| <span class="sd"> Currently only supports 'em' and 'online'.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setOptimizer("em")</span> |
| <span class="sd"> >>> algo.getOptimizer()</span> |
| <span class="sd"> 'em'</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">optimizer</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setLearningOffset"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setLearningOffset">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setLearningOffset</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`learningOffset`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setLearningOffset(100)</span> |
| <span class="sd"> >>> algo.getLearningOffset()</span> |
| <span class="sd"> 100.0</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">learningOffset</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setLearningDecay"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setLearningDecay">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setLearningDecay</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`learningDecay`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setLearningDecay(0.1)</span> |
| <span class="sd"> >>> algo.getLearningDecay()</span> |
| <span class="sd"> 0.1...</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">learningDecay</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setSubsamplingRate"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setSubsamplingRate">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSubsamplingRate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`subsamplingRate`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setSubsamplingRate(0.1)</span> |
| <span class="sd"> >>> algo.getSubsamplingRate()</span> |
| <span class="sd"> 0.1...</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">subsamplingRate</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setOptimizeDocConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setOptimizeDocConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOptimizeDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`optimizeDocConcentration`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setOptimizeDocConcentration(True)</span> |
| <span class="sd"> >>> algo.getOptimizeDocConcentration()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">optimizeDocConcentration</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setDocConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setDocConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDocConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`docConcentration`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setDocConcentration([0.1, 0.2])</span> |
| <span class="sd"> >>> algo.getDocConcentration()</span> |
| <span class="sd"> [0.1..., 0.2...]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">docConcentration</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setTopicConcentration"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setTopicConcentration">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setTopicConcentration</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`topicConcentration`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setTopicConcentration(0.5)</span> |
| <span class="sd"> >>> algo.getTopicConcentration()</span> |
| <span class="sd"> 0.5...</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">topicConcentration</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setTopicDistributionCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setTopicDistributionCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setTopicDistributionCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`topicDistributionCol`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setTopicDistributionCol("topicDistributionCol")</span> |
| <span class="sd"> >>> algo.getTopicDistributionCol()</span> |
| <span class="sd"> 'topicDistributionCol'</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">topicDistributionCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setKeepLastCheckpoint"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setKeepLastCheckpoint">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setKeepLastCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`keepLastCheckpoint`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> algo = LDA().setKeepLastCheckpoint(False)</span> |
| <span class="sd"> >>> algo.getKeepLastCheckpoint()</span> |
| <span class="sd"> False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">keepLastCheckpoint</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxIter`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDA.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.LDA.html#pyspark.ml.clustering.LDA.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">_PowerIterationClusteringParams</span><span class="p">(</span><span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasWeightCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`PowerIterationClustering`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"k"</span><span class="p">,</span> |
| <span class="s2">"The number of clusters to create. Must be > 1."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">initMode</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"initMode"</span><span class="p">,</span> |
| <span class="s2">"The initialization algorithm. This can be either "</span> |
| <span class="o">+</span> <span class="s2">"'random' to use a random vector as vertex properties, or 'degree' to use "</span> |
| <span class="o">+</span> <span class="s2">"a normalized sum of similarities with other vertices. Supported options: "</span> |
| <span class="o">+</span> <span class="s2">"'random' and 'degree'."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">srcCol</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"srcCol"</span><span class="p">,</span> |
| <span class="s2">"Name of the input column for source vertex IDs."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">dstCol</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"dstCol"</span><span class="p">,</span> |
| <span class="s2">"Name of the input column for destination vertex IDs."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_PowerIterationClusteringParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">initMode</span><span class="o">=</span><span class="s2">"random"</span><span class="p">,</span> <span class="n">srcCol</span><span class="o">=</span><span class="s2">"src"</span><span class="p">,</span> <span class="n">dstCol</span><span class="o">=</span><span class="s2">"dst"</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`k` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`initMode` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">initMode</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSrcCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`srcCol` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">srcCol</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getDstCol</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`dstCol` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dstCol</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="PowerIterationClustering"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">PowerIterationClustering</span><span class="p">(</span> |
| <span class="n">_PowerIterationClusteringParams</span><span class="p">,</span> |
| <span class="n">JavaParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"PowerIterationClustering"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by</span> |
| <span class="sd"> `Lin and Cohen <http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf>`_. From the</span> |
| <span class="sd"> abstract: PIC finds a very low-dimensional embedding of a dataset using truncated power</span> |
| <span class="sd"> iteration on a normalized pair-wise similarity matrix of the data.</span> |
| |
| <span class="sd"> This class is not yet an Estimator/Transformer; use the :py:func:`assignClusters` method</span> |
| <span class="sd"> to run the PowerIterationClustering algorithm.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> See `Wikipedia on Spectral clustering <http://en.wikipedia.org/wiki/Spectral_clustering>`_</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> data = [(1, 0, 0.5),</span> |
| <span class="sd"> ... (2, 0, 0.5), (2, 1, 0.7),</span> |
| <span class="sd"> ... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),</span> |
| <span class="sd"> ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),</span> |
| <span class="sd"> ... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight").repartition(1)</span> |
| <span class="sd"> >>> pic = PowerIterationClustering(k=2, weightCol="weight")</span> |
| <span class="sd"> >>> pic.setMaxIter(40)</span> |
| <span class="sd"> PowerIterationClustering...</span> |
| <span class="sd"> >>> assignments = pic.assignClusters(df)</span> |
| <span class="sd"> >>> assignments.sort(assignments.id).show(truncate=False)</span> |
| <span class="sd"> +---+-------+</span> |
| <span class="sd"> |id |cluster|</span> |
| <span class="sd"> +---+-------+</span> |
| <span class="sd"> |0 |0 |</span> |
| <span class="sd"> |1 |0 |</span> |
| <span class="sd"> |2 |0 |</span> |
| <span class="sd"> |3 |0 |</span> |
| <span class="sd"> |4 |0 |</span> |
| <span class="sd"> |5 |1 |</span> |
| <span class="sd"> +---+-------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> pic_path = temp_path + "/pic"</span> |
| <span class="sd"> >>> pic.save(pic_path)</span> |
| <span class="sd"> >>> pic2 = PowerIterationClustering.load(pic_path)</span> |
| <span class="sd"> >>> pic2.getK()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> pic2.getMaxIter()</span> |
| <span class="sd"> 40</span> |
| <span class="sd"> >>> pic2.assignClusters(df).take(6) == assignments.take(6)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"random"</span><span class="p">,</span> |
| <span class="n">srcCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"src"</span><span class="p">,</span> |
| <span class="n">dstCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"dst"</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst",\</span> |
| <span class="sd"> weightCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">PowerIterationClustering</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.clustering.PowerIterationClustering"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"random"</span><span class="p">,</span> |
| <span class="n">srcCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"src"</span><span class="p">,</span> |
| <span class="n">dstCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"dst"</span><span class="p">,</span> |
| <span class="n">weightCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClustering"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst",\</span> |
| <span class="sd"> weightCol=None)</span> |
| <span class="sd"> Sets params for PowerIterationClustering.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClustering"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`k`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.setInitMode"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setInitMode">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInitMode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClustering"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`initMode`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">initMode</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.setSrcCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setSrcCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSrcCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClustering"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`srcCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">srcCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.setDstCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setDstCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDstCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClustering"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`dstCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dstCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setMaxIter">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClustering"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxIter`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.setWeightCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.setWeightCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWeightCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClustering"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`weightCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">weightCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.assignClusters"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.clustering.PowerIterationClustering.html#pyspark.ml.clustering.PowerIterationClustering.assignClusters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">assignClusters</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Runs the PIC algorithm and returns a cluster assignment for each input vertex.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dataset : :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> A dataset with columns src, dst, weight representing the affinity matrix,</span> |
| <span class="sd"> which is the matrix A in the PIC paper. Suppose the src column value is i,</span> |
| <span class="sd"> the dst column value is j, the weight column value is similarity s_ij,</span> |
| <span class="sd"> which must be nonnegative. This is a symmetric matrix and hence</span> |
| <span class="sd"> s_ij = s_ji. For any (i, j) with nonzero similarity, there should be</span> |
| <span class="sd"> either (i, j, s_ij) or (j, i, s_ji) in the input. Rows with i = j are</span> |
| <span class="sd"> ignored, because we assume s_ij = 0.0.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> A dataset that contains columns of vertex id and the corresponding cluster for</span> |
| <span class="sd"> the id. The schema of it will be:</span> |
| <span class="sd"> - id: Long</span> |
| <span class="sd"> - cluster: Int</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_transfer_params_to_java</span><span class="p">()</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">assignClusters</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">_jdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="n">dataset</span><span class="o">.</span><span class="n">sparkSession</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">numpy</span> |
| <span class="kn">import</span> <span class="nn">pyspark.ml.clustering</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="c1"># Numpy 1.14+ changed its string format.</span> |
| <span class="n">numpy</span><span class="o">.</span><span class="n">set_printoptions</span><span class="p">(</span><span class="n">legacy</span><span class="o">=</span><span class="s2">"1.13"</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span> |
| <span class="k">pass</span> |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">clustering</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="c1"># The small batch size here ensures that we see multiple batches,</span> |
| <span class="c1"># even in these small test examples:</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[2]"</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"ml.clustering tests"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"spark"</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span> |
| <span class="kn">import</span> <span class="nn">tempfile</span> |
| |
| <span class="n">temp_path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"temp_path"</span><span class="p">]</span> <span class="o">=</span> <span class="n">temp_path</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">finally</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">shutil</span> <span class="kn">import</span> <span class="n">rmtree</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">rmtree</span><span class="p">(</span><span class="n">temp_path</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span> |
| <span class="k">pass</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |