Source code for pyspark.mllib.clustering (PySpark 3.3.1)

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
import array as pyarray
from math import exp, log
from collections import namedtuple
from typing import Any, List, Optional, Tuple, TypeVar, Union, overload, TYPE_CHECKING

import numpy as np
from numpy import array, random, tile

from pyspark import SparkContext, since
from pyspark.rdd import RDD
from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector  # noqa: F401
from pyspark.mllib.stat.distribution import MultivariateGaussian
from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
from pyspark.streaming import DStream

if TYPE_CHECKING:
    from py4j.java_gateway import JavaObject
    from pyspark.mllib._typing import VectorLike

T = TypeVar("T")

__all__ = [
    "BisectingKMeansModel",
    "BisectingKMeans",
    "KMeansModel",
    "KMeans",
    "GaussianMixtureModel",
    "GaussianMixture",
    "PowerIterationClusteringModel",
    "PowerIterationClustering",
    "StreamingKMeans",
    "StreamingKMeansModel",
    "LDA",
    "LDAModel",
]

@inherit_doc
class BisectingKMeansModel(JavaModelWrapper):
    """
    A clustering model derived from the bisecting k-means method.

    .. versionadded:: 2.0.0

    Examples
    --------
    >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
    >>> bskm = BisectingKMeans()
    >>> model = bskm.train(sc.parallelize(data, 2), k=4)
    >>> p = array([0.0, 0.0])
    >>> model.predict(p)
    0
    >>> model.k
    4
    >>> model.computeCost(p)
    0.0
    """

    def __init__(self, java_model: "JavaObject"):
        super(BisectingKMeansModel, self).__init__(java_model)
        self.centers = [c.toArray() for c in self.call("clusterCenters")]

    @property  # type: ignore[misc]
    @since("2.0.0")
    def clusterCenters(self) -> List[np.ndarray]:
        """Get the cluster centers, represented as a list of NumPy
        arrays."""
        return self.centers

    @property  # type: ignore[misc]
    @since("2.0.0")
    def k(self) -> int:
        """Get the number of clusters."""
        return self.call("k")

    @overload
    def predict(self, x: "VectorLike") -> int:
        ...

    @overload
    def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
        ...

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int, RDD[int]]:
        """
        Find the cluster that each of the points belongs to in this
        model.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to determine cluster index.
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).

        Returns
        -------
        int or :py:class:`pyspark.RDD` of int
            Predicted cluster index or an RDD of predicted cluster indices
            if the input is an RDD.
        """
        if isinstance(x, RDD):
            vecs = x.map(_convert_to_vector)
            return self.call("predict", vecs)

        x = _convert_to_vector(x)
        return self.call("predict", x)

    def computeCost(self, x: Union["VectorLike", RDD["VectorLike"]]) -> float:
        """
        Return the Bisecting K-means cost (sum of squared distances of
        points to their nearest center) for this model on the given
        data. If provided with an RDD of points, returns the sum.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to compute the cost(s).
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).
        """
        if isinstance(x, RDD):
            vecs = x.map(_convert_to_vector)
            return self.call("computeCost", vecs)

        return self.call("computeCost", _convert_to_vector(x))

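# Illustrative usage sketch (editorial addition, not part of the original module):
# a trained BisectingKMeansModel accepts either a single point or an RDD of points,
# as described in the docstrings above. Assumes an active SparkContext ``sc``; the
# data values are arbitrary. Training itself is done through BisectingKMeans below.
#
#   from numpy import array
#   rdd = sc.parallelize(array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2))
#   model = BisectingKMeans.train(rdd, k=2)
#   model.predict([0.0, 0.0])      # single point -> int cluster index
#   model.predict(rdd).collect()   # RDD input -> RDD of cluster indices
#   model.computeCost(rdd)         # sum of squared distances to the nearest centers
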
class BisectingKMeans:
    """
    A bisecting k-means algorithm based on the paper "A comparison of
    document clustering techniques" by Steinbach, Karypis, and Kumar,
    with modification to fit Spark.
    The algorithm starts from a single cluster that contains all points.
    Iteratively it finds divisible clusters on the bottom level and
    bisects each of them using k-means, until there are `k` leaf
    clusters in total or no leaf clusters are divisible.
    The bisecting steps of clusters on the same level are grouped
    together to increase parallelism. If bisecting all divisible
    clusters on the bottom level would result in more than `k` leaf
    clusters, larger clusters get higher priority.

    .. versionadded:: 2.0.0

    Notes
    -----
    See the original paper [1]_

    .. [1] Steinbach, M. et al. "A Comparison of Document Clustering Techniques." (2000).
        KDD Workshop on Text Mining, 2000
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf
    """

    @classmethod
    def train(
        cls,
        rdd: RDD["VectorLike"],
        k: int = 4,
        maxIterations: int = 20,
        minDivisibleClusterSize: float = 1.0,
        seed: int = -1888008604,
    ) -> BisectingKMeansModel:
        """
        Runs the bisecting k-means algorithm and returns the resulting model.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of `Vector` or convertible
            sequence types.
        k : int, optional
            The desired number of leaf clusters. The actual number could
            be smaller if there are no divisible leaf clusters.
            (default: 4)
        maxIterations : int, optional
            Maximum number of iterations allowed to split clusters.
            (default: 20)
        minDivisibleClusterSize : float, optional
            Minimum number of points (if >= 1.0) or the minimum proportion
            of points (if < 1.0) of a divisible cluster.
            (default: 1.0)
        seed : int, optional
            Random seed value for cluster initialization.
            (default: -1888008604 from classOf[BisectingKMeans].getName.##)
        """
        java_model = callMLlibFunc(
            "trainBisectingKMeans",
            rdd.map(_convert_to_vector),
            k,
            maxIterations,
            minDivisibleClusterSize,
            seed,
        )
        return BisectingKMeansModel(java_model)

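# Illustrative training sketch (editorial addition, not part of the original module):
# the parameters mirror the docstring of ``BisectingKMeans.train`` above. Assumes an
# active SparkContext ``sc``; the values are arbitrary examples, not recommendations.
#
#   from numpy import array
#   points = sc.parallelize(array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2), 2)
#   model = BisectingKMeans.train(
#       points,
#       k=2,                          # desired number of leaf clusters
#       maxIterations=20,             # cap on the k-means iterations used for each split
#       minDivisibleClusterSize=1.0,  # >= 1.0: point count; < 1.0: proportion of points
#   )
#   model.clusterCenters              # list of NumPy arrays, one per leaf cluster
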
@inherit_doc
class KMeansModel(Saveable, Loader["KMeansModel"]):

    """A clustering model derived from the k-means method.

    .. versionadded:: 0.9.0

    Examples
    --------
    >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
    >>> model = KMeans.train(
    ...     sc.parallelize(data), 2, maxIterations=10, initializationMode="random",
    ...     seed=50, initializationSteps=5, epsilon=1e-4)
    >>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
    True
    >>> model.predict(array([8.0, 9.0])) == model.predict(array([9.0, 8.0]))
    True
    >>> model.k
    2
    >>> model.computeCost(sc.parallelize(data))
    2.0
    >>> model = KMeans.train(sc.parallelize(data), 2)
    >>> sparse_data = [
    ...     SparseVector(3, {1: 1.0}),
    ...     SparseVector(3, {1: 1.1}),
    ...     SparseVector(3, {2: 1.0}),
    ...     SparseVector(3, {2: 1.1})
    ... ]
    >>> model = KMeans.train(sc.parallelize(sparse_data), 2, initializationMode="k-means||",
    ...                      seed=50, initializationSteps=5, epsilon=1e-4)
    >>> model.predict(array([0., 1., 0.])) == model.predict(array([0, 1.1, 0.]))
    True
    >>> model.predict(array([0., 0., 1.])) == model.predict(array([0, 0, 1.1]))
    True
    >>> model.predict(sparse_data[0]) == model.predict(sparse_data[1])
    True
    >>> model.predict(sparse_data[2]) == model.predict(sparse_data[3])
    True
    >>> isinstance(model.clusterCenters, list)
    True
    >>> import os, tempfile
    >>> path = tempfile.mkdtemp()
    >>> model.save(sc, path)
    >>> sameModel = KMeansModel.load(sc, path)
    >>> sameModel.predict(sparse_data[0]) == model.predict(sparse_data[0])
    True
    >>> from shutil import rmtree
    >>> try:
    ...     rmtree(path)
    ... except OSError:
    ...     pass

    >>> data = array([-383.1,-382.9, 28.7,31.2, 366.2,367.3]).reshape(3, 2)
    >>> model = KMeans.train(sc.parallelize(data), 3, maxIterations=0,
    ...     initialModel = KMeansModel([(-1000.0,-1000.0),(5.0,5.0),(1000.0,1000.0)]))
    >>> model.clusterCenters
    [array([-1000., -1000.]), array([ 5.,  5.]), array([ 1000.,  1000.])]
    """

    def __init__(self, centers: List["VectorLike"]):
        self.centers = centers

    @property  # type: ignore[misc]
    @since("1.0.0")
    def clusterCenters(self) -> List["VectorLike"]:
        """Get the cluster centers, represented as a list of NumPy arrays."""
        return self.centers

    @property  # type: ignore[misc]
    @since("1.4.0")
    def k(self) -> int:
        """Total number of clusters."""
        return len(self.centers)

    @overload
    def predict(self, x: "VectorLike") -> int:
        ...

    @overload
    def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
        ...

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int, RDD[int]]:
        """
        Find the cluster that each of the points belongs to in this
        model.

        .. versionadded:: 0.9.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to determine cluster index.
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).

        Returns
        -------
        int or :py:class:`pyspark.RDD` of int
            Predicted cluster index or an RDD of predicted cluster indices
            if the input is an RDD.
        """
        best = 0
        best_distance = float("inf")
        if isinstance(x, RDD):
            return x.map(self.predict)

        x = _convert_to_vector(x)
        for i in range(len(self.centers)):
            distance = x.squared_distance(self.centers[i])  # type: ignore[attr-defined]
            if distance < best_distance:
                best = i
                best_distance = distance
        return best

    def computeCost(self, rdd: RDD["VectorLike"]) -> float:
        """
        Return the K-means cost (sum of squared distances of points to
        their nearest center) for this model on the given data.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            The RDD of points to compute the cost on.
        """
        cost = callMLlibFunc(
            "computeCostKmeansModel",
            rdd.map(_convert_to_vector),
            [_convert_to_vector(c) for c in self.centers],
        )
        return cost

    @since("1.4.0")
    def save(self, sc: SparkContext, path: str) -> None:
        """
        Save this model to the given path.
        """
        assert sc._jvm is not None

        java_centers = _py2java(sc, [_convert_to_vector(c) for c in self.centers])
        java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
        java_model.save(sc._jsc.sc(), path)

    @classmethod
    @since("1.4.0")
    def load(cls, sc: SparkContext, path: str) -> "KMeansModel":
        """
        Load a model from the given path.
        """
        assert sc._jvm is not None

        java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load(sc._jsc.sc(), path)
        return KMeansModel(_java2py(sc, java_model.clusterCenters()))

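# Illustrative sketch (editorial addition, not part of the original module): a
# KMeansModel can be built directly from a list of centers. A single point is
# classified locally by scanning ``self.centers`` for the smallest squared
# distance, while an RDD input is handled by mapping ``predict`` over it.
# Assumes an active SparkContext ``sc``; the data values are arbitrary.
#
#   from numpy import array
#   local_model = KMeansModel([array([0.0, 0.0]), array([10.0, 10.0])])
#   local_model.predict([1.0, 1.0])                     # -> 0 (closest to [0, 0])
#   pts = sc.parallelize([[1.0, 1.0], [9.0, 9.5]])
#   local_model.predict(pts).collect()                  # -> [0, 1]
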
class KMeans:
    """
    K-means clustering.

    .. versionadded:: 0.9.0
    """

    @classmethod
    def train(
        cls,
        rdd: RDD["VectorLike"],
        k: int,
        maxIterations: int = 100,
        initializationMode: str = "k-means||",
        seed: Optional[int] = None,
        initializationSteps: int = 2,
        epsilon: float = 1e-4,
        initialModel: Optional[KMeansModel] = None,
        distanceMeasure: str = "euclidean",
    ) -> "KMeansModel":
        """
        Train a k-means clustering model.

        .. versionadded:: 0.9.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
            or convertible sequence types.
        k : int
            Number of clusters to create.
        maxIterations : int, optional
            Maximum number of iterations allowed.
            (default: 100)
        initializationMode : str, optional
            The initialization algorithm. This can be either "random" or
            "k-means||".
            (default: "k-means||")
        seed : int, optional
            Random seed value for cluster initialization. Set as None to
            generate seed based on system time.
            (default: None)
        initializationSteps : int, optional
            Number of steps for the k-means|| initialization mode.
            This is an advanced setting -- the default of 2 is almost
            always enough.
            (default: 2)
        epsilon : float, optional
            Distance threshold within which a center will be considered to
            have converged. If all centers move less than this Euclidean
            distance, iterations are stopped.
            (default: 1e-4)
        initialModel : :py:class:`KMeansModel`, optional
            Initial cluster centers can be provided as a KMeansModel object
            rather than using the random or k-means|| initializationMode.
            (default: None)
        distanceMeasure : str, optional
            The distance measure used by the k-means algorithm.
            (default: "euclidean")
        """
        clusterInitialModel = []
        if initialModel is not None:
            if not isinstance(initialModel, KMeansModel):
                raise TypeError(
                    "initialModel is of " + str(type(initialModel)) + ". It needs "
                    "to be of <type 'KMeansModel'>"
                )
            clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]
        model = callMLlibFunc(
            "trainKMeansModel",
            rdd.map(_convert_to_vector),
            k,
            maxIterations,
            initializationMode,
            seed,
            initializationSteps,
            epsilon,
            clusterInitialModel,
            distanceMeasure,
        )
        centers = callJavaFunc(rdd.context, model.clusterCenters)
        return KMeansModel([c.toArray() for c in centers])

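# Illustrative training sketch (editorial addition, not part of the original module):
# it exercises the ``initialModel`` and ``distanceMeasure`` arguments documented in
# ``KMeans.train`` above. Assumes an active SparkContext ``sc``; the values are
# arbitrary examples.
#
#   from numpy import array
#   points = sc.parallelize(array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2))
#   start = KMeansModel([array([0.0, 0.0]), array([9.0, 9.0])])
#   model = KMeans.train(
#       points,
#       2,
#       maxIterations=10,
#       initialModel=start,           # skip the random / k-means|| initialization
#       distanceMeasure="euclidean",  # "cosine" is the other supported measure
#   )
#   model.computeCost(points)         # k-means cost on the training data
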
| <div class="viewcode-block" id="GaussianMixtureModel"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.GaussianMixtureModel.html#pyspark.mllib.clustering.GaussianMixtureModel">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">GaussianMixtureModel</span><span class="p">(</span><span class="n">JavaModelWrapper</span><span class="p">,</span> <span class="n">JavaSaveable</span><span class="p">,</span> <span class="n">JavaLoader</span><span class="p">[</span><span class="s2">"GaussianMixtureModel"</span><span class="p">]):</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd"> A clustering model derived from the Gaussian Mixture Model method.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.mllib.linalg import Vectors, DenseMatrix</span> |
| <span class="sd"> >>> from numpy.testing import assert_equal</span> |
| <span class="sd"> >>> from shutil import rmtree</span> |
| <span class="sd"> >>> import os, tempfile</span> |
| |
| <span class="sd"> >>> clusterdata_1 = sc.parallelize(array([-0.1,-0.05,-0.01,-0.1,</span> |
| <span class="sd"> ... 0.9,0.8,0.75,0.935,</span> |
| <span class="sd"> ... -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2), 2)</span> |
| <span class="sd"> >>> model = GaussianMixture.train(clusterdata_1, 3, convergenceTol=0.0001,</span> |
| <span class="sd"> ... maxIterations=50, seed=10)</span> |
| <span class="sd"> >>> labels = model.predict(clusterdata_1).collect()</span> |
| <span class="sd"> >>> labels[0]==labels[1]</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> labels[1]==labels[2]</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> labels[4]==labels[5]</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model.predict([-0.1,-0.05])</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> >>> softPredicted = model.predictSoft([-0.1,-0.05])</span> |
| <span class="sd"> >>> abs(softPredicted[0] - 1.0) < 0.03</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> abs(softPredicted[1] - 0.0) < 0.03</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> abs(softPredicted[2] - 0.0) < 0.03</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> >>> path = tempfile.mkdtemp()</span> |
| <span class="sd"> >>> model.save(sc, path)</span> |
| <span class="sd"> >>> sameModel = GaussianMixtureModel.load(sc, path)</span> |
| <span class="sd"> >>> assert_equal(model.weights, sameModel.weights)</span> |
| <span class="sd"> >>> mus, sigmas = list(</span> |
| <span class="sd"> ... zip(*[(g.mu, g.sigma) for g in model.gaussians]))</span> |
| <span class="sd"> >>> sameMus, sameSigmas = list(</span> |
| <span class="sd"> ... zip(*[(g.mu, g.sigma) for g in sameModel.gaussians]))</span> |
| <span class="sd"> >>> mus == sameMus</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> sigmas == sameSigmas</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> from shutil import rmtree</span> |
| <span class="sd"> >>> try:</span> |
| <span class="sd"> ... rmtree(path)</span> |
| <span class="sd"> ... except OSError:</span> |
| <span class="sd"> ... pass</span> |
| |
| <span class="sd"> >>> data = array([-5.1971, -2.5359, -3.8220,</span> |
| <span class="sd"> ... -5.2211, -5.0602, 4.7118,</span> |
| <span class="sd"> ... 6.8989, 3.4592, 4.6322,</span> |
| <span class="sd"> ... 5.7048, 4.6567, 5.5026,</span> |
| <span class="sd"> ... 4.5605, 5.2043, 6.2734])</span> |
| <span class="sd"> >>> clusterdata_2 = sc.parallelize(data.reshape(5,3))</span> |
| <span class="sd"> >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,</span> |
| <span class="sd"> ... maxIterations=150, seed=4)</span> |
| <span class="sd"> >>> labels = model.predict(clusterdata_2).collect()</span> |
| <span class="sd"> >>> labels[0]==labels[1]</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> labels[2]==labels[3]==labels[4]</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@property</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">weights</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Weights for each Gaussian distribution in the mixture, where weights[i] is</span> |
| <span class="sd"> the weight for Gaussian i, and weights.sum == 1.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">array</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"weights"</span><span class="p">))</span> |
| |
| <span class="nd">@property</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">gaussians</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">MultivariateGaussian</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Array of MultivariateGaussian where gaussians[i] represents</span> |
| <span class="sd"> the Multivariate Gaussian (Normal) Distribution for Gaussian i.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">[</span> |
| <span class="n">MultivariateGaussian</span><span class="p">(</span><span class="n">gaussian</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">gaussian</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> <span class="k">for</span> <span class="n">gaussian</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"gaussians"</span><span class="p">)</span> |
| <span class="p">]</span> |
| |
| <span class="nd">@property</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">k</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="sd">"""Number of gaussians in mixture."""</span> |
| <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">weights</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"VectorLike"</span><span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">int64</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">])</span> <span class="o">-></span> <span class="n">RDD</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.predict"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.GaussianMixtureModel.html#pyspark.mllib.clustering.GaussianMixtureModel.predict">[docs]</a> <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">,</span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">RDD</span><span class="p">[</span><span class="nb">int</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Find the cluster to which the point 'x' or each point in RDD 'x'</span> |
| <span class="sd"> has maximum membership in this model.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`</span> |
| <span class="sd"> A feature vector or an RDD of vectors representing data points.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> numpy.float64 or :py:class:`pyspark.RDD` of int</span> |
| <span class="sd"> Predicted cluster label or an RDD of predicted cluster labels</span> |
| <span class="sd"> if the input is an RDD.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="n">cluster_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">predictSoft</span><span class="p">(</span><span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">z</span><span class="p">:</span> <span class="n">z</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="nb">max</span><span class="p">(</span><span class="n">z</span><span class="p">)))</span> |
| <span class="k">return</span> <span class="n">cluster_labels</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">z</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">predictSoft</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">z</span><span class="o">.</span><span class="n">argmax</span><span class="p">()</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">predictSoft</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"VectorLike"</span><span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">predictSoft</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">])</span> <span class="o">-></span> <span class="n">RDD</span><span class="p">[</span><span class="n">pyarray</span><span class="o">.</span><span class="n">array</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.predictSoft"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.GaussianMixtureModel.html#pyspark.mllib.clustering.GaussianMixtureModel.predictSoft">[docs]</a> <span class="k">def</span> <span class="nf">predictSoft</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">,</span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="n">RDD</span><span class="p">[</span><span class="n">pyarray</span><span class="o">.</span><span class="n">array</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Find the membership of point 'x' or each point in RDD 'x' to all mixture components.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`</span> |
| <span class="sd"> A feature vector or an RDD of vectors representing data points.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> numpy.ndarray or :py:class:`pyspark.RDD`</span> |
| <span class="sd"> The membership value to all mixture components for vector 'x'</span> |
| <span class="sd"> or each vector in RDD 'x'.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="n">means</span><span class="p">,</span> <span class="n">sigmas</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="p">[(</span><span class="n">g</span><span class="o">.</span><span class="n">mu</span><span class="p">,</span> <span class="n">g</span><span class="o">.</span><span class="n">sigma</span><span class="p">)</span> <span class="k">for</span> <span class="n">g</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">gaussians</span><span class="p">])</span> |
| <span class="n">membership_matrix</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span> |
| <span class="s2">"predictSoftGMM"</span><span class="p">,</span> |
| <span class="n">x</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_vector</span><span class="p">),</span> |
| <span class="n">_convert_to_vector</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">weights</span><span class="p">),</span> |
| <span class="n">means</span><span class="p">,</span> |
| <span class="n">sigmas</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">membership_matrix</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">pyarray</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s2">"d"</span><span class="p">,</span> <span class="n">x</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"predictSoft"</span><span class="p">,</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">x</span><span class="p">))</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="GaussianMixtureModel.load"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.GaussianMixtureModel.html#pyspark.mllib.clustering.GaussianMixtureModel.load">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">sc</span><span class="p">:</span> <span class="n">SparkContext</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GaussianMixtureModel"</span><span class="p">:</span> |
| <span class="sd">"""Load the GaussianMixtureModel from disk.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sc : :py:class:`SparkContext`</span> |
| <span class="sd"> path : str</span> |
| <span class="sd"> Path to where the model is stored.</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| |
| <span class="n">model</span> <span class="o">=</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_load_java</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span> |
| <span class="n">wrapper</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">python</span><span class="o">.</span><span class="n">GaussianMixtureModelWrapper</span><span class="p">(</span><span class="n">model</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">cls</span><span class="p">(</span><span class="n">wrapper</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="GaussianMixture"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.GaussianMixture.html#pyspark.mllib.clustering.GaussianMixture">[docs]</a><span class="k">class</span> <span class="nc">GaussianMixture</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="GaussianMixture.train"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.GaussianMixture.html#pyspark.mllib.clustering.GaussianMixture.train">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="k">def</span> <span class="nf">train</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">],</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> |
| <span class="n">convergenceTol</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-3</span><span class="p">,</span> |
| <span class="n">maxIterations</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">initialModel</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">GaussianMixtureModel</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">GaussianMixtureModel</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Train a Gaussian Mixture clustering model.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rdd : ::py:class:`pyspark.RDD`</span> |
| <span class="sd"> Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`</span> |
| <span class="sd"> or convertible sequence types.</span> |
| <span class="sd"> k : int</span> |
| <span class="sd"> Number of independent Gaussians in the mixture model.</span> |
| <span class="sd"> convergenceTol : float, optional</span> |
| <span class="sd"> Maximum change in log-likelihood at which convergence is</span> |
| <span class="sd"> considered to have occurred.</span> |
| <span class="sd"> (default: 1e-3)</span> |
| <span class="sd"> maxIterations : int, optional</span> |
| <span class="sd"> Maximum number of iterations allowed.</span> |
| <span class="sd"> (default: 100)</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> Random seed for initial Gaussian distribution. Set as None to</span> |
| <span class="sd"> generate seed based on system time.</span> |
| <span class="sd"> (default: None)</span> |
| <span class="sd"> initialModel : GaussianMixtureModel, optional</span> |
| <span class="sd"> Initial GMM starting point, bypassing the random</span> |
| <span class="sd"> initialization.</span> |
| <span class="sd"> (default: None)</span> |
| <span class="sd"> """</span> |
| <span class="n">initialModelWeights</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">initialModelMu</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">initialModelSigma</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">initialModel</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">initialModel</span><span class="o">.</span><span class="n">k</span> <span class="o">!=</span> <span class="n">k</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Mismatched cluster count, initialModel.k = </span><span class="si">%s</span><span class="s2">, however k = </span><span class="si">%s</span><span class="s2">"</span> |
| <span class="o">%</span> <span class="p">(</span><span class="n">initialModel</span><span class="o">.</span><span class="n">k</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">initialModelWeights</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">initialModel</span><span class="o">.</span><span class="n">weights</span><span class="p">)</span> |
| <span class="n">initialModelMu</span> <span class="o">=</span> <span class="p">[</span><span class="n">initialModel</span><span class="o">.</span><span class="n">gaussians</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">mu</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">initialModel</span><span class="o">.</span><span class="n">k</span><span class="p">)]</span> |
| <span class="n">initialModelSigma</span> <span class="o">=</span> <span class="p">[</span><span class="n">initialModel</span><span class="o">.</span><span class="n">gaussians</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">sigma</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">initialModel</span><span class="o">.</span><span class="n">k</span><span class="p">)]</span> |
| <span class="n">java_model</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span> |
| <span class="s2">"trainGaussianMixtureModel"</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_vector</span><span class="p">),</span> |
| <span class="n">k</span><span class="p">,</span> |
| <span class="n">convergenceTol</span><span class="p">,</span> |
| <span class="n">maxIterations</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">,</span> |
| <span class="n">initialModelWeights</span><span class="p">,</span> |
| <span class="n">initialModelMu</span><span class="p">,</span> |
| <span class="n">initialModelSigma</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">GaussianMixtureModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="PowerIterationClusteringModel"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.PowerIterationClusteringModel.html#pyspark.mllib.clustering.PowerIterationClusteringModel">[docs]</a><span class="k">class</span> <span class="nc">PowerIterationClusteringModel</span><span class="p">(</span> |
| <span class="n">JavaModelWrapper</span><span class="p">,</span> <span class="n">JavaSaveable</span><span class="p">,</span> <span class="n">JavaLoader</span><span class="p">[</span><span class="s2">"PowerIterationClusteringModel"</span><span class="p">]</span> |
| <span class="p">):</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd"> Model produced by :py:class:`PowerIterationClustering`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> def genCircle(r, n):</span> |
| <span class="sd"> ... points = []</span> |
| <span class="sd"> ... for i in range(0, n):</span> |
| <span class="sd"> ... theta = 2.0 * math.pi * i / n</span> |
| <span class="sd"> ... points.append((r * math.cos(theta), r * math.sin(theta)))</span> |
| <span class="sd"> ... return points</span> |
| <span class="sd"> >>> def sim(x, y):</span> |
| <span class="sd"> ... dist2 = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])</span> |
| <span class="sd"> ... return math.exp(-dist2 / 2.0)</span> |
| <span class="sd"> >>> r1 = 1.0</span> |
| <span class="sd"> >>> n1 = 10</span> |
| <span class="sd"> >>> r2 = 4.0</span> |
| <span class="sd"> >>> n2 = 40</span> |
| <span class="sd"> >>> n = n1 + n2</span> |
| <span class="sd"> >>> points = genCircle(r1, n1) + genCircle(r2, n2)</span> |
| <span class="sd"> >>> similarities = [(i, j, sim(points[i], points[j])) for i in range(1, n) for j in range(0, i)]</span> |
| <span class="sd"> >>> rdd = sc.parallelize(similarities, 2)</span> |
| <span class="sd"> >>> model = PowerIterationClustering.train(rdd, 2, 40)</span> |
| <span class="sd"> >>> model.k</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> result = sorted(model.assignments().collect(), key=lambda x: x.id)</span> |
| <span class="sd"> >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> import os, tempfile</span> |
| <span class="sd"> >>> path = tempfile.mkdtemp()</span> |
| <span class="sd"> >>> model.save(sc, path)</span> |
| <span class="sd"> >>> sameModel = PowerIterationClusteringModel.load(sc, path)</span> |
| <span class="sd"> >>> sameModel.k</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> result = sorted(model.assignments().collect(), key=lambda x: x.id)</span> |
| <span class="sd"> >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> from shutil import rmtree</span> |
| <span class="sd"> >>> try:</span> |
| <span class="sd"> ... rmtree(path)</span> |
| <span class="sd"> ... except OSError:</span> |
| <span class="sd"> ... pass</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@property</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">k</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns the number of clusters.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"k"</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="PowerIterationClusteringModel.assignments"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.PowerIterationClusteringModel.html#pyspark.mllib.clustering.PowerIterationClusteringModel.assignments">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">assignments</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"PowerIterationClustering.Assignment"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns the cluster assignments of this model.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"getAssignments"</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">PowerIterationClustering</span><span class="o">.</span><span class="n">Assignment</span><span class="p">(</span><span class="o">*</span><span class="n">x</span><span class="p">)))</span></div> |
| |
| <div class="viewcode-block" id="PowerIterationClusteringModel.load"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.PowerIterationClusteringModel.html#pyspark.mllib.clustering.PowerIterationClusteringModel.load">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">sc</span><span class="p">:</span> <span class="n">SparkContext</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PowerIterationClusteringModel"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Load a model from the given path.</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| |
| <span class="n">model</span> <span class="o">=</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_load_java</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span> |
| <span class="n">wrapper</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">python</span><span class="o">.</span><span class="n">PowerIterationClusteringModelWrapper</span><span class="p">(</span> |
| <span class="n">model</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">PowerIterationClusteringModel</span><span class="p">(</span><span class="n">wrapper</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="PowerIterationClustering"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.PowerIterationClustering.html#pyspark.mllib.clustering.PowerIterationClustering">[docs]</a><span class="k">class</span> <span class="nc">PowerIterationClustering</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Power Iteration Clustering (PIC), a scalable graph clustering algorithm.</span> |
| |
| |
| <span class="sd"> Developed by Lin and Cohen [1]_. From the abstract:</span> |
| |
| <span class="sd"> "PIC finds a very low-dimensional embedding of a</span> |
| <span class="sd"> dataset using truncated power iteration on a normalized pair-wise</span> |
| <span class="sd"> similarity matrix of the data."</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. [1] Lin, Frank & Cohen, William. (2010). Power Iteration Clustering.</span> |
| <span class="sd"> http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="PowerIterationClustering.train"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.PowerIterationClustering.html#pyspark.mllib.clustering.PowerIterationClustering.train">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="k">def</span> <span class="nf">train</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">]],</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> |
| <span class="n">maxIterations</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> |
| <span class="n">initMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"random"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">PowerIterationClusteringModel</span><span class="p">:</span> |
| <span class="sa">r</span><span class="sd">"""</span> |
| <span class="sd"> Train PowerIterationClusteringModel</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rdd : :py:class:`pyspark.RDD`</span> |
| <span class="sd"> An RDD of (i, j, s\ :sub:`ij`\) tuples representing the</span> |
| <span class="sd"> affinity matrix, which is the matrix A in the PIC paper. The</span> |
| <span class="sd"> similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric</span> |
| <span class="sd"> matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\ For any (i, j) with</span> |
| <span class="sd"> nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or</span> |
| <span class="sd"> (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored,</span> |
| <span class="sd"> because it is assumed s\ :sub:`ij`\ = 0.0.</span> |
| <span class="sd"> k : int</span> |
| <span class="sd"> Number of clusters.</span> |
| <span class="sd"> maxIterations : int, optional</span> |
| <span class="sd"> Maximum number of iterations of the PIC algorithm.</span> |
| <span class="sd"> (default: 100)</span> |
| <span class="sd"> initMode : str, optional</span> |
| <span class="sd"> Initialization mode. This can be either "random" to use</span> |
| <span class="sd"> a random vector as vertex properties, or "degree" to use</span> |
| <span class="sd"> normalized sum similarities.</span> |
| <span class="sd"> (default: "random")</span> |
| <span class="sd"> """</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span> |
| <span class="s2">"trainPowerIterationClusteringModel"</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_vector</span><span class="p">),</span> |
| <span class="nb">int</span><span class="p">(</span><span class="n">k</span><span class="p">),</span> |
| <span class="nb">int</span><span class="p">(</span><span class="n">maxIterations</span><span class="p">),</span> |
| <span class="n">initMode</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">PowerIterationClusteringModel</span><span class="p">(</span><span class="n">model</span><span class="p">)</span></div> |
| |
| <span class="k">class</span> <span class="nc">Assignment</span><span class="p">(</span><span class="n">namedtuple</span><span class="p">(</span><span class="s2">"Assignment"</span><span class="p">,</span> <span class="p">[</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"cluster"</span><span class="p">])):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents an (id, cluster) tuple.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| <span class="sd"> """</span></div> |
| |
| |
| <div class="viewcode-block" id="StreamingKMeansModel"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeansModel.html#pyspark.mllib.clustering.StreamingKMeansModel">[docs]</a><span class="k">class</span> <span class="nc">StreamingKMeansModel</span><span class="p">(</span><span class="n">KMeansModel</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Clustering model which can perform an online update of the centroids.</span> |
| |
| <span class="sd"> The update formula for each centroid is given by</span> |
| |
| <span class="sd"> - c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t + m_t)</span> |
| <span class="sd"> - n_t+1 = n_t * a + m_t</span> |
| |
| <span class="sd"> where</span> |
| |
| <span class="sd"> - c_t: Centroid at the n_th iteration.</span> |
| <span class="sd"> - n_t: Number of samples (or) weights associated with the centroid</span> |
| <span class="sd"> at the n_th iteration.</span> |
| <span class="sd"> - x_t: Centroid of the new data closest to c_t.</span> |
| <span class="sd"> - m_t: Number of samples (or) weights of the new data closest to c_t</span> |
| <span class="sd"> - c_t+1: New centroid.</span> |
| <span class="sd"> - n_t+1: New number of weights.</span> |
| <span class="sd"> - a: Decay Factor, which gives the forgetfulness.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> clusterCenters : list of :py:class:`pyspark.mllib.linalg.Vector` or covertible</span> |
| <span class="sd"> Initial cluster centers.</span> |
| <span class="sd"> clusterWeights : :py:class:`pyspark.mllib.linalg.Vector` or covertible</span> |
| <span class="sd"> List of weights assigned to each cluster.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> If a is set to 1, it is the weighted mean of the previous</span> |
| <span class="sd"> and new data. If it set to zero, the old centroids are completely</span> |
| <span class="sd"> forgotten.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> initCenters = [[0.0, 0.0], [1.0, 1.0]]</span> |
| <span class="sd"> >>> initWeights = [1.0, 1.0]</span> |
| <span class="sd"> >>> stkm = StreamingKMeansModel(initCenters, initWeights)</span> |
| <span class="sd"> >>> data = sc.parallelize([[-0.1, -0.1], [0.1, 0.1],</span> |
| <span class="sd"> ... [0.9, 0.9], [1.1, 1.1]])</span> |
| <span class="sd"> >>> stkm = stkm.update(data, 1.0, "batches")</span> |
| <span class="sd"> >>> stkm.centers</span> |
| <span class="sd"> array([[ 0., 0.],</span> |
| <span class="sd"> [ 1., 1.]])</span> |
| <span class="sd"> >>> stkm.predict([-0.1, -0.1])</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> >>> stkm.predict([0.9, 0.9])</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> >>> stkm.clusterWeights</span> |
| <span class="sd"> [3.0, 3.0]</span> |
| <span class="sd"> >>> decayFactor = 0.0</span> |
| <span class="sd"> >>> data = sc.parallelize([DenseVector([1.5, 1.5]), DenseVector([0.2, 0.2])])</span> |
| <span class="sd"> >>> stkm = stkm.update(data, 0.0, "batches")</span> |
| <span class="sd"> >>> stkm.centers</span> |
| <span class="sd"> array([[ 0.2, 0.2],</span> |
| <span class="sd"> [ 1.5, 1.5]])</span> |
| <span class="sd"> >>> stkm.clusterWeights</span> |
| <span class="sd"> [1.0, 1.0]</span> |
| <span class="sd"> >>> stkm.predict([0.2, 0.2])</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> >>> stkm.predict([1.5, 1.5])</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">clusterCenters</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">],</span> <span class="n">clusterWeights</span><span class="p">:</span> <span class="s2">"VectorLike"</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">StreamingKMeansModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">centers</span><span class="o">=</span><span class="n">clusterCenters</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_clusterWeights</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">clusterWeights</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span> |
| |
| <span class="nd">@property</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">clusterWeights</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">float64</span><span class="p">]:</span> |
| <span class="sd">"""Return the cluster weights."""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_clusterWeights</span> |
| |
| <div class="viewcode-block" id="StreamingKMeansModel.update"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeansModel.html#pyspark.mllib.clustering.StreamingKMeansModel.update">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">update</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">],</span> <span class="n">decayFactor</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">timeUnit</span><span class="p">:</span> <span class="nb">str</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StreamingKMeansModel"</span><span class="p">:</span> |
| <span class="sd">"""Update the centroids, according to data</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : :py:class:`pyspark.RDD`</span> |
| <span class="sd"> RDD with new data for the model update.</span> |
| <span class="sd"> decayFactor : float</span> |
| <span class="sd"> Forgetfulness of the previous centroids.</span> |
| <span class="sd"> timeUnit : str</span> |
| <span class="sd"> Can be "batches" or "points". If points, then the decay factor</span> |
| <span class="sd"> is raised to the power of number of new points and if batches,</span> |
| <span class="sd"> then decay factor will be used as is.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Data should be of an RDD, got </span><span class="si">%s</span><span class="s2">."</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">data</span><span class="p">))</span> |
| <span class="n">data</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_vector</span><span class="p">)</span> |
| <span class="n">decayFactor</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">decayFactor</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">timeUnit</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"batches"</span><span class="p">,</span> <span class="s2">"points"</span><span class="p">]:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"timeUnit should be 'batches' or 'points', got </span><span class="si">%s</span><span class="s2">."</span> <span class="o">%</span> <span class="n">timeUnit</span><span class="p">)</span> |
| <span class="n">vectorCenters</span> <span class="o">=</span> <span class="p">[</span><span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">center</span><span class="p">)</span> <span class="k">for</span> <span class="n">center</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">centers</span><span class="p">]</span> |
| <span class="n">updatedModel</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span> |
| <span class="s2">"updateStreamingKMeansModel"</span><span class="p">,</span> |
| <span class="n">vectorCenters</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_clusterWeights</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">,</span> |
| <span class="n">decayFactor</span><span class="p">,</span> |
| <span class="n">timeUnit</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">centers</span> <span class="o">=</span> <span class="n">array</span><span class="p">(</span><span class="n">updatedModel</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_clusterWeights</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">updatedModel</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> |
| <span class="k">return</span> <span class="bp">self</span></div></div> |
| |
| |
| <div class="viewcode-block" id="StreamingKMeans"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans">[docs]</a><span class="k">class</span> <span class="nc">StreamingKMeans</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Provides methods to set k, decayFactor, timeUnit to configure the</span> |
| <span class="sd"> KMeans algorithm for fitting and predicting on incoming dstreams.</span> |
| <span class="sd"> More details on how the centroids are updated are provided under the</span> |
| <span class="sd"> docs of StreamingKMeansModel.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> k : int, optional</span> |
| <span class="sd"> Number of clusters.</span> |
| <span class="sd"> (default: 2)</span> |
| <span class="sd"> decayFactor : float, optional</span> |
| <span class="sd"> Forgetfulness of the previous centroids.</span> |
| <span class="sd"> (default: 1.0)</span> |
| <span class="sd"> timeUnit : str, optional</span> |
| <span class="sd"> Can be "batches" or "points". If points, then the decay factor is</span> |
| <span class="sd"> raised to the power of number of new points and if batches, then</span> |
| <span class="sd"> decay factor will be used as is.</span> |
| <span class="sd"> (default: "batches")</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">decayFactor</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> <span class="n">timeUnit</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"batches"</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_k</span> <span class="o">=</span> <span class="n">k</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_decayFactor</span> <span class="o">=</span> <span class="n">decayFactor</span> |
| <span class="k">if</span> <span class="n">timeUnit</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"batches"</span><span class="p">,</span> <span class="s2">"points"</span><span class="p">]:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"timeUnit should be 'batches' or 'points', got </span><span class="si">%s</span><span class="s2">."</span> <span class="o">%</span> <span class="n">timeUnit</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_timeUnit</span> <span class="o">=</span> <span class="n">timeUnit</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_model</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">StreamingKMeansModel</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <div class="viewcode-block" id="StreamingKMeans.latestModel"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.latestModel">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">latestModel</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">StreamingKMeansModel</span><span class="p">]:</span> |
| <span class="sd">"""Return the latest model"""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_model</span></div> |
| |
| <span class="k">def</span> <span class="nf">_validate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dstream</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_model</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Initial centers should be set either by setInitialCenters "</span> <span class="s2">"or setRandomCenters."</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dstream</span><span class="p">,</span> <span class="n">DStream</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Expected dstream to be of type DStream, "</span> <span class="s2">"got type </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">dstream</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="StreamingKMeans.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StreamingKMeans"</span><span class="p">:</span> |
| <span class="sd">"""Set number of clusters."""</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_k</span> <span class="o">=</span> <span class="n">k</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="StreamingKMeans.setDecayFactor"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.setDecayFactor">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDecayFactor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">decayFactor</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StreamingKMeans"</span><span class="p">:</span> |
| <span class="sd">"""Set decay factor."""</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_decayFactor</span> <span class="o">=</span> <span class="n">decayFactor</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="StreamingKMeans.setHalfLife"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.setHalfLife">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setHalfLife</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">halfLife</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">timeUnit</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StreamingKMeans"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Set number of batches after which the centroids of that</span> |
| <span class="sd"> particular batch has half the weightage.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_timeUnit</span> <span class="o">=</span> <span class="n">timeUnit</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_decayFactor</span> <span class="o">=</span> <span class="n">exp</span><span class="p">(</span><span class="n">log</span><span class="p">(</span><span class="mf">0.5</span><span class="p">)</span> <span class="o">/</span> <span class="n">halfLife</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="StreamingKMeans.setInitialCenters"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.setInitialCenters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInitialCenters</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">centers</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">],</span> <span class="n">weights</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StreamingKMeans"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Set initial centers. Should be set before calling trainOn.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_model</span> <span class="o">=</span> <span class="n">StreamingKMeansModel</span><span class="p">(</span><span class="n">centers</span><span class="p">,</span> <span class="n">weights</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="StreamingKMeans.setRandomCenters"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.setRandomCenters">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setRandomCenters</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dim</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">weight</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">seed</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StreamingKMeans"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Set the initial centers to be random samples from</span> |
| <span class="sd"> a gaussian population with constant weights.</span> |
| <span class="sd"> """</span> |
| <span class="n">rng</span> <span class="o">=</span> <span class="n">random</span><span class="o">.</span><span class="n">RandomState</span><span class="p">(</span><span class="n">seed</span><span class="p">)</span> |
| <span class="n">clusterCenters</span> <span class="o">=</span> <span class="n">rng</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_k</span><span class="p">,</span> <span class="n">dim</span><span class="p">)</span> |
| <span class="n">clusterWeights</span> <span class="o">=</span> <span class="n">tile</span><span class="p">(</span><span class="n">weight</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_k</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_model</span> <span class="o">=</span> <span class="n">StreamingKMeansModel</span><span class="p">(</span><span class="n">clusterCenters</span><span class="p">,</span> <span class="n">clusterWeights</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="StreamingKMeans.trainOn"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.trainOn">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">trainOn</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dstream</span><span class="p">:</span> <span class="s2">"DStream[VectorLike]"</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""Train the model on the incoming dstream."""</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_validate</span><span class="p">(</span><span class="n">dstream</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">update</span><span class="p">(</span><span class="n">rdd</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="s2">"VectorLike"</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_model</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_decayFactor</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timeUnit</span><span class="p">)</span> <span class="c1"># type: ignore[union-attr]</span> |
| |
| <span class="n">dstream</span><span class="o">.</span><span class="n">foreachRDD</span><span class="p">(</span><span class="n">update</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StreamingKMeans.predictOn"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.predictOn">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predictOn</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dstream</span><span class="p">:</span> <span class="s2">"DStream[VectorLike]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DStream[int]"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Make predictions on a dstream.</span> |
| <span class="sd"> Returns a transformed dstream object</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_validate</span><span class="p">(</span><span class="n">dstream</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">dstream</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">x</span><span class="p">))</span> <span class="c1"># type: ignore[union-attr]</span></div> |
| |
| <div class="viewcode-block" id="StreamingKMeans.predictOnValues"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.StreamingKMeans.html#pyspark.mllib.clustering.StreamingKMeans.predictOnValues">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">predictOnValues</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dstream</span><span class="p">:</span> <span class="s2">"DStream[Tuple[T, VectorLike]]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DStream[Tuple[T, int]]"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Make predictions on a keyed dstream.</span> |
| <span class="sd"> Returns a transformed dstream object.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_validate</span><span class="p">(</span><span class="n">dstream</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">dstream</span><span class="o">.</span><span class="n">mapValues</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">x</span><span class="p">))</span> <span class="c1"># type: ignore[union-attr]</span></div></div> |
| |
| |
| <div class="viewcode-block" id="LDAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.LDAModel.html#pyspark.mllib.clustering.LDAModel">[docs]</a><span class="k">class</span> <span class="nc">LDAModel</span><span class="p">(</span><span class="n">JavaModelWrapper</span><span class="p">,</span> <span class="n">JavaSaveable</span><span class="p">,</span> <span class="n">Loader</span><span class="p">[</span><span class="s2">"LDAModel"</span><span class="p">]):</span> |
| |
| <span class="sd">"""A clustering model derived from the LDA method.</span> |
| |
| <span class="sd"> Latent Dirichlet Allocation (LDA), a topic model designed for text documents.</span> |
| <span class="sd"> Terminology</span> |
| |
| <span class="sd"> - "word" = "term": an element of the vocabulary</span> |
| <span class="sd"> - "token": instance of a term appearing in a document</span> |
| <span class="sd"> - "topic": multinomial distribution over words representing some concept</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> See the original LDA paper (journal version) [1]_</span> |
| |
| <span class="sd"> .. [1] Blei, D. et al. "Latent Dirichlet Allocation."</span> |
| <span class="sd"> J. Mach. Learn. Res. 3 (2003): 993-1022.</span> |
| <span class="sd"> https://www.jmlr.org/papers/v3/blei03a</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.mllib.linalg import Vectors</span> |
| <span class="sd"> >>> from numpy.testing import assert_almost_equal, assert_equal</span> |
| <span class="sd"> >>> data = [</span> |
| <span class="sd"> ... [1, Vectors.dense([0.0, 1.0])],</span> |
| <span class="sd"> ... [2, SparseVector(2, {0: 1.0})],</span> |
| <span class="sd"> ... ]</span> |
| <span class="sd"> >>> rdd = sc.parallelize(data)</span> |
| <span class="sd"> >>> model = LDA.train(rdd, k=2, seed=1)</span> |
| <span class="sd"> >>> model.vocabSize()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model.describeTopics()</span> |
| <span class="sd"> [([1, 0], [0.5..., 0.49...]), ([0, 1], [0.5..., 0.49...])]</span> |
| <span class="sd"> >>> model.describeTopics(1)</span> |
| <span class="sd"> [([1], [0.5...]), ([0], [0.5...])]</span> |
| |
| <span class="sd"> >>> topics = model.topicsMatrix()</span> |
| <span class="sd"> >>> topics_expect = array([[0.5, 0.5], [0.5, 0.5]])</span> |
| <span class="sd"> >>> assert_almost_equal(topics, topics_expect, 1)</span> |
| |
| <span class="sd"> >>> import os, tempfile</span> |
| <span class="sd"> >>> from shutil import rmtree</span> |
| <span class="sd"> >>> path = tempfile.mkdtemp()</span> |
| <span class="sd"> >>> model.save(sc, path)</span> |
| <span class="sd"> >>> sameModel = LDAModel.load(sc, path)</span> |
| <span class="sd"> >>> assert_equal(sameModel.topicsMatrix(), model.topicsMatrix())</span> |
| <span class="sd"> >>> sameModel.vocabSize() == model.vocabSize()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> try:</span> |
| <span class="sd"> ... rmtree(path)</span> |
| <span class="sd"> ... except OSError:</span> |
| <span class="sd"> ... pass</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="LDAModel.topicsMatrix"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.LDAModel.html#pyspark.mllib.clustering.LDAModel.topicsMatrix">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">topicsMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span> |
| <span class="sd">"""Inferred topics, where each topic is represented by a distribution over terms."""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"topicsMatrix"</span><span class="p">)</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.vocabSize"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.LDAModel.html#pyspark.mllib.clustering.LDAModel.vocabSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">vocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="sd">"""Vocabulary size (number of terms or terms in the vocabulary)"""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"vocabSize"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.describeTopics"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.LDAModel.html#pyspark.mllib.clustering.LDAModel.describeTopics">[docs]</a> <span class="k">def</span> <span class="nf">describeTopics</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">maxTermsPerTopic</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]:</span> |
| <span class="sd">"""Return the topics described by weighted terms.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| <span class="sd"> .. warning:: If vocabSize and k are large, this can return a large object!</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> maxTermsPerTopic : int, optional</span> |
| <span class="sd"> Maximum number of terms to collect for each topic.</span> |
| <span class="sd"> (default: vocabulary size)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> Array over topics. Each topic is represented as a pair of</span> |
| <span class="sd"> matching arrays: (term indices, term weights in topic).</span> |
| <span class="sd"> Each topic's terms are sorted in order of decreasing weight.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">maxTermsPerTopic</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">topics</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"describeTopics"</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">topics</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"describeTopics"</span><span class="p">,</span> <span class="n">maxTermsPerTopic</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">topics</span></div> |
| |
| <div class="viewcode-block" id="LDAModel.load"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.LDAModel.html#pyspark.mllib.clustering.LDAModel.load">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">sc</span><span class="p">:</span> <span class="n">SparkContext</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LDAModel"</span><span class="p">:</span> |
| <span class="sd">"""Load the LDAModel from disk.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sc : :py:class:`pyspark.SparkContext`</span> |
| <span class="sd"> path : str</span> |
| <span class="sd"> Path to where the model is stored.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">SparkContext</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"sc should be a SparkContext, got type </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">sc</span><span class="p">))</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"path should be a string, got type </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">path</span><span class="p">))</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"loadLDAModel"</span><span class="p">,</span> <span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">LDAModel</span><span class="p">(</span><span class="n">model</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="LDA"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.LDA.html#pyspark.mllib.clustering.LDA">[docs]</a><span class="k">class</span> <span class="nc">LDA</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Train Latent Dirichlet Allocation (LDA) model.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="LDA.train"><a class="viewcode-back" href="../../../reference/api/pyspark.mllib.clustering.LDA.html#pyspark.mllib.clustering.LDA.train">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="k">def</span> <span class="nf">train</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="s2">"VectorLike"</span><span class="p">]],</span> |
| <span class="n">k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span> |
| <span class="n">maxIterations</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">docConcentration</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">topicConcentration</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">checkpointInterval</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span> |
| <span class="n">optimizer</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"em"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">LDAModel</span><span class="p">:</span> |
| <span class="sd">"""Train a LDA model.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rdd : :py:class:`pyspark.RDD`</span> |
| <span class="sd"> RDD of documents, which are tuples of document IDs and term</span> |
| <span class="sd"> (word) count vectors. The term count vectors are "bags of</span> |
| <span class="sd"> words" with a fixed-size vocabulary (where the vocabulary size</span> |
| <span class="sd"> is the length of the vector). Document IDs must be unique</span> |
| <span class="sd"> and >= 0.</span> |
| <span class="sd"> k : int, optional</span> |
| <span class="sd"> Number of topics to infer, i.e., the number of soft cluster</span> |
| <span class="sd"> centers.</span> |
| <span class="sd"> (default: 10)</span> |
| <span class="sd"> maxIterations : int, optional</span> |
| <span class="sd"> Maximum number of iterations allowed.</span> |
| <span class="sd"> (default: 20)</span> |
| <span class="sd"> docConcentration : float, optional</span> |
| <span class="sd"> Concentration parameter (commonly named "alpha") for the prior</span> |
| <span class="sd"> placed on documents' distributions over topics ("theta").</span> |
| <span class="sd"> (default: -1.0)</span> |
| <span class="sd"> topicConcentration : float, optional</span> |
| <span class="sd"> Concentration parameter (commonly named "beta" or "eta") for</span> |
| <span class="sd"> the prior placed on topics' distributions over terms.</span> |
| <span class="sd"> (default: -1.0)</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> Random seed for cluster initialization. Set as None to generate</span> |
| <span class="sd"> seed based on system time.</span> |
| <span class="sd"> (default: None)</span> |
| <span class="sd"> checkpointInterval : int, optional</span> |
| <span class="sd"> Period (in iterations) between checkpoints.</span> |
| <span class="sd"> (default: 10)</span> |
| <span class="sd"> optimizer : str, optional</span> |
| <span class="sd"> LDAOptimizer used to perform the actual calculation. Currently</span> |
| <span class="sd"> "em", "online" are supported.</span> |
| <span class="sd"> (default: "em")</span> |
| <span class="sd"> """</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span> |
| <span class="s2">"trainLDAModel"</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">,</span> |
| <span class="n">maxIterations</span><span class="p">,</span> |
| <span class="n">docConcentration</span><span class="p">,</span> |
| <span class="n">topicConcentration</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">,</span> |
| <span class="n">checkpointInterval</span><span class="p">,</span> |
| <span class="n">optimizer</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">LDAModel</span><span class="p">(</span><span class="n">model</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">numpy</span> |
| <span class="kn">import</span> <span class="nn">pyspark.mllib.clustering</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="c1"># Numpy 1.14+ changed it's string format.</span> |
| <span class="n">numpy</span><span class="o">.</span><span class="n">set_printoptions</span><span class="p">(</span><span class="n">legacy</span><span class="o">=</span><span class="s2">"1.13"</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span> |
| <span class="k">pass</span> |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">clustering</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">,</span> <span class="s2">"PythonTest"</span><span class="p">,</span> <span class="n">batchSize</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |