blob: 59ac28416da234e05764e021d93b524166546a1f [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>StreamingKMeans &#8212; PySpark 3.1.1 documentation</title>
<link rel="stylesheet" href="../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
<link rel="stylesheet"
href="../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet"
href="../../_static/vendor/open-sans_all/1.44.1/index.css">
<link rel="stylesheet"
href="../../_static/vendor/lato_latin-ext/1.44.1/index.css">
<link rel="stylesheet" href="../../_static/basic.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../../_static/js/index.3da636dd464baa7582d2.js">
<script id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script src="../../_static/jquery.js"></script>
<script src="../../_static/underscore.js"></script>
<script src="../../_static/doctools.js"></script>
<script src="../../_static/language_data.js"></script>
<script src="../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.clustering.StreamingKMeans.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="StreamingKMeansModel" href="pyspark.mllib.clustering.StreamingKMeansModel.html" />
<link rel="prev" title="PowerIterationClustering" href="pyspark.mllib.clustering.PowerIterationClustering.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en" />
<!-- Matomo -->
<script type="text/javascript">
// Matomo command queue: reuse window._paq when it already exists, otherwise start with an empty array.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(["trackPageView"]);
_paq.push(["enableLinkTracking"]);
(function () {
  // Base URL shared by the tracking endpoint (matomo.php) and the tracker script (matomo.js).
  var baseUrl = "https://analytics.apache.org/";
  _paq.push(["setTrackerUrl", baseUrl + "matomo.php"]);
  _paq.push(["setSiteId", "40"]);

  // Create the tracker <script> element and insert it ahead of the first
  // <script> element found in the document, flagged async.
  var doc = document;
  var trackerScript = doc.createElement("script");
  var firstScript = doc.getElementsByTagName("script")[0];
  trackerScript.async = true;
  trackerScript.src = baseUrl + "matomo.js";
  firstScript.parentNode.insertBefore(trackerScript, firstScript);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main">
<div class="container-xl">
<a class="navbar-brand" href="../../index.html">
<img src="../../_static/spark-logo-reverse.png" class="logo" alt="logo" />
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
<ul id="navbar-main-elements" class="navbar-nav mr-auto">
<li class="nav-item ">
<a class="nav-link" href="../../getting_started/index.html">Getting Started</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../user_guide/index.html">User Guide</a>
</li>
<li class="nav-item active">
<a class="nav-link" href="../index.html">API Reference</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../development/index.html">Development</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../migration_guide/index.html">Migration Guide</a>
</li>
</ul>
<ul class="navbar-nav">
</ul>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../search.html" method="get">
<i class="icon fas fa-search" aria-hidden="true"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<ul class="nav bd-sidenav">
<li class="">
<a href="../pyspark.sql.html">Spark SQL</a>
</li>
<li class="">
<a href="../pyspark.ss.html">Structured Streaming</a>
</li>
<li class="">
<a href="../pyspark.ml.html">MLlib (DataFrame-based)</a>
</li>
<li class="">
<a href="../pyspark.streaming.html">Spark Streaming</a>
</li>
<li class="active">
<a href="../pyspark.mllib.html">MLlib (RDD-based)</a>
</li>
<li class="">
<a href="../pyspark.html">Spark Core</a>
</li>
<li class="">
<a href="../pyspark.resource.html">Resource Management</a>
</li>
</ul>
</div>
</nav>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<nav id="bd-toc-nav">
<ul class="nav section-nav flex-column">
</ul>
</nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<div class="section" id="streamingkmeans">
<h1>StreamingKMeans<a class="headerlink" href="#streamingkmeans" title="Permalink to this headline"></a></h1>
<dl class="py class">
<dt id="pyspark.mllib.clustering.StreamingKMeans">
<em class="property">class </em><code class="sig-prename descclassname">pyspark.mllib.clustering.</code><code class="sig-name descname">StreamingKMeans</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">k</span><span class="o">=</span><span class="default_value">2</span></em>, <em class="sig-param"><span class="n">decayFactor</span><span class="o">=</span><span class="default_value">1.0</span></em>, <em class="sig-param"><span class="n">timeUnit</span><span class="o">=</span><span class="default_value">'batches'</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans" title="Permalink to this definition"></a></dt>
<dd><p>Provides methods to set k, decayFactor, timeUnit to configure the
KMeans algorithm for fitting and predicting on incoming dstreams.
More details on how the centroids are updated are provided under the
docs of StreamingKMeansModel.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl class="simple">
<dt><strong>k</strong><span class="classifier">int, optional</span></dt><dd><p>Number of clusters.
(default: 2)</p>
</dd>
<dt><strong>decayFactor</strong><span class="classifier">float, optional</span></dt><dd><p>Forgetfulness of the previous centroids.
(default: 1.0)</p>
</dd>
<dt><strong>timeUnit</strong><span class="classifier">str, optional</span></dt><dd><p>Can be “batches” or “points”. If points, then the decay factor is
raised to the power of number of new points and if batches, then
decay factor will be used as is.
(default: “batches”)</p>
</dd>
</dl>
</dd>
</dl>
<p class="rubric">Methods</p>
<table class="longtable table autosummary">
<colgroup>
<col style="width: 10%" />
<col style="width: 90%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.latestModel" title="pyspark.mllib.clustering.StreamingKMeans.latestModel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">latestModel</span></code></a>()</p></td>
<td><p>Return the latest model.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.predictOn" title="pyspark.mllib.clustering.StreamingKMeans.predictOn"><code class="xref py py-obj docutils literal notranslate"><span class="pre">predictOn</span></code></a>(dstream)</p></td>
<td><p>Make predictions on a dstream.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.predictOnValues" title="pyspark.mllib.clustering.StreamingKMeans.predictOnValues"><code class="xref py py-obj docutils literal notranslate"><span class="pre">predictOnValues</span></code></a>(dstream)</p></td>
<td><p>Make predictions on a keyed dstream.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.setDecayFactor" title="pyspark.mllib.clustering.StreamingKMeans.setDecayFactor"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setDecayFactor</span></code></a>(decayFactor)</p></td>
<td><p>Set decay factor.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.setHalfLife" title="pyspark.mllib.clustering.StreamingKMeans.setHalfLife"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setHalfLife</span></code></a>(halfLife, timeUnit)</p></td>
<td><p>Set the number of batches after which the centroids of that particular batch have half the weight.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.setInitialCenters" title="pyspark.mllib.clustering.StreamingKMeans.setInitialCenters"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setInitialCenters</span></code></a>(centers, weights)</p></td>
<td><p>Set initial centers.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.setK" title="pyspark.mllib.clustering.StreamingKMeans.setK"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setK</span></code></a>(k)</p></td>
<td><p>Set number of clusters.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.setRandomCenters" title="pyspark.mllib.clustering.StreamingKMeans.setRandomCenters"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setRandomCenters</span></code></a>(dim, weight, seed)</p></td>
<td><p>Set the initial centers to be random samples from a gaussian population with constant weights.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.mllib.clustering.StreamingKMeans.trainOn" title="pyspark.mllib.clustering.StreamingKMeans.trainOn"><code class="xref py py-obj docutils literal notranslate"><span class="pre">trainOn</span></code></a>(dstream)</p></td>
<td><p>Train the model on the incoming dstream.</p></td>
</tr>
</tbody>
</table>
<p class="rubric">Methods Documentation</p>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.latestModel">
<code class="sig-name descname">latestModel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.latestModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.latestModel" title="Permalink to this definition"></a></dt>
<dd><p>Return the latest model.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.predictOn">
<code class="sig-name descname">predictOn</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">dstream</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.predictOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.predictOn" title="Permalink to this definition"></a></dt>
<dd><p>Make predictions on a dstream.
Returns a transformed dstream object.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.predictOnValues">
<code class="sig-name descname">predictOnValues</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">dstream</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.predictOnValues"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.predictOnValues" title="Permalink to this definition"></a></dt>
<dd><p>Make predictions on a keyed dstream.
Returns a transformed dstream object.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setDecayFactor">
<code class="sig-name descname">setDecayFactor</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">decayFactor</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.setDecayFactor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setDecayFactor" title="Permalink to this definition"></a></dt>
<dd><p>Set decay factor.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setHalfLife">
<code class="sig-name descname">setHalfLife</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">halfLife</span></em>, <em class="sig-param"><span class="n">timeUnit</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.setHalfLife"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setHalfLife" title="Permalink to this definition"></a></dt>
<dd><p>Set the number of batches after which the centroids of that
particular batch have half the weight.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setInitialCenters">
<code class="sig-name descname">setInitialCenters</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">centers</span></em>, <em class="sig-param"><span class="n">weights</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.setInitialCenters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setInitialCenters" title="Permalink to this definition"></a></dt>
<dd><p>Set initial centers. Should be set before calling trainOn.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setK">
<code class="sig-name descname">setK</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">k</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.setK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setK" title="Permalink to this definition"></a></dt>
<dd><p>Set number of clusters.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setRandomCenters">
<code class="sig-name descname">setRandomCenters</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">dim</span></em>, <em class="sig-param"><span class="n">weight</span></em>, <em class="sig-param"><span class="n">seed</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.setRandomCenters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setRandomCenters" title="Permalink to this definition"></a></dt>
<dd><p>Set the initial centers to be random samples from
a gaussian population with constant weights.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.trainOn">
<code class="sig-name descname">trainOn</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">dstream</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/mllib/clustering.html#StreamingKMeans.trainOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.trainOn" title="Permalink to this definition"></a></dt>
<dd><p>Train the model on the incoming dstream.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
</div>
<div class='prev-next-bottom'>
<a class='left-prev' id="prev-link" href="pyspark.mllib.clustering.PowerIterationClustering.html" title="previous page">PowerIterationClustering</a>
<a class='right-next' id="next-link" href="pyspark.mllib.clustering.StreamingKMeansModel.html" title="next page">StreamingKMeansModel</a>
</div>
</main>
</div>
</div>
<script src="../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
&copy; Copyright .<br/>
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>