blob: baf08041a9653e46227bb7e1944580abffebc821 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>pyspark.mllib package &#8212; PySpark 2.2.1 documentation</title>
<link rel="stylesheet" href="_static/nature.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/pyspark.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
URL_ROOT: './',
VERSION: '2.2.1',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true,
SOURCELINK_SUFFIX: '.txt'
};
</script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="_static/pyspark.js"></script>
<link rel="search" title="Search" href="search.html" />
<link rel="prev" title="pyspark.ml package" href="pyspark.ml.html" />
</head>
<body>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="pyspark.ml.html" title="pyspark.ml package"
accesskey="P">previous</a></li>
<li class="nav-item nav-item-0"><a href="index.html">PySpark 2.2.1 documentation</a> &#187;</li>
<li class="nav-item nav-item-1"><a href="pyspark.html" accesskey="U">pyspark package</a> &#187;</li>
</ul>
</div>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<div class="section" id="pyspark-mllib-package">
<h1>pyspark.mllib package<a class="headerlink" href="#pyspark-mllib-package" title="Permalink to this headline"></a></h1>
<div class="section" id="module-pyspark.mllib.classification">
<span id="pyspark-mllib-classification-module"></span><h2>pyspark.mllib.classification module<a class="headerlink" href="#module-pyspark.mllib.classification" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.classification.LogisticRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">LogisticRegressionModel</code><span class="sig-paren">(</span><em>weights</em>, <em>intercept</em>, <em>numFeatures</em>, <em>numClasses</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Classification model trained using Multinomial/Binary Logistic
Regression.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>weights</strong> – Weights computed for every feature.</li>
<li><strong>intercept</strong> – Intercept computed for this model. (Only used in Binary Logistic
Regression. In Multinomial Logistic Regression, the intercepts will
not be a single value, so the intercepts will be part of the
weights.)</li>
<li><strong>numFeatures</strong> – The dimension of the features.</li>
<li><strong>numClasses</strong> – The number of possible outcomes for a k-class classification problem
in Multinomial Logistic Regression. By default, it is binary
logistic regression so numClasses will be set to 2.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LogisticRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]]))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1, 0]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">clearThreshold</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="go">0.279...</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">sparse_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LogisticRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]))</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]))</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">LogisticRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]))</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">multi_class_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">multi_class_data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mcm</span> <span class="o">=</span> <span class="n">LogisticRegressionWithLBFGS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">numClasses</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mcm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mcm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.8</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mcm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.3</span><span class="p">])</span>
<span class="go">2</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.clearThreshold">
<code class="descname">clearThreshold</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.clearThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Clears the threshold so that <cite>predict</cite> will output raw
prediction scores. It is used for binary classification only.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Intercept computed for this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.numClasses">
<code class="descname">numClasses</code><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.numClasses" title="Permalink to this definition"></a></dt>
<dd><p>Number of possible outcomes for a k-class classification problem
in Multinomial Logistic Regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Dimension of the features.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict values for a single data point or an RDD of points
using the model trained.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.setThreshold">
<code class="descname">setThreshold</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.setThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Sets the threshold that separates positive predictions from
negative predictions. An example with prediction score greater
than or equal to this threshold is identified as a positive,
and negative otherwise. It is used for binary classification
only.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.threshold">
<code class="descname">threshold</code><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.threshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns the threshold (if any) used for converting raw
prediction scores into 0/1 predictions. It is used for
binary classification only.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.LogisticRegressionModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weights computed for every feature.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.classification.LogisticRegressionWithSGD">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">LogisticRegressionWithSGD</code><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionWithSGD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionWithSGD" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Deprecated in 2.0.0. Use ml.classification.LogisticRegression or
LogisticRegressionWithLBFGS.</p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.classification.LogisticRegressionWithSGD.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>iterations=100</em>, <em>step=1.0</em>, <em>miniBatchFraction=1.0</em>, <em>initialWeights=None</em>, <em>regParam=0.01</em>, <em>regType='l2'</em>, <em>intercept=False</em>, <em>validateData=True</em>, <em>convergenceTol=0.001</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionWithSGD.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionWithSGD.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a logistic regression model on the given data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The training data, an RDD of LabeledPoint.</li>
<li><strong>iterations</strong> – The number of iterations.
(default: 100)</li>
<li><strong>step</strong> – The step parameter used in SGD.
(default: 1.0)</li>
<li><strong>miniBatchFraction</strong> – Fraction of data to be used for each SGD iteration.
(default: 1.0)</li>
<li><strong>initialWeights</strong> – The initial weights.
(default: None)</li>
<li><strong>regParam</strong> – The regularizer parameter.
(default: 0.01)</li>
<li><strong>regType</strong><p>The type of regularizer used for training our model.
Supported values:</p>
<blockquote>
<div><ul>
<li>"l1" for using L1 regularization</li>
<li>"l2" for using L2 regularization (default)</li>
<li>None for no regularization</li>
</ul>
</div></blockquote>
</li>
<li><strong>intercept</strong> – Boolean parameter which indicates whether to use the
augmented representation for training data (i.e., whether bias
features are activated or not).
(default: False)</li>
<li><strong>validateData</strong> – Boolean parameter which indicates if the algorithm should
validate data before training.
(default: True)</li>
<li><strong>convergenceTol</strong> – A condition which decides iteration termination.
(default: 0.001)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.classification.LogisticRegressionWithLBFGS">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">LogisticRegressionWithLBFGS</code><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionWithLBFGS"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionWithLBFGS" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.classification.LogisticRegressionWithLBFGS.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>iterations=100</em>, <em>initialWeights=None</em>, <em>regParam=0.0</em>, <em>regType='l2'</em>, <em>intercept=False</em>, <em>corrections=10</em>, <em>tolerance=1e-06</em>, <em>validateData=True</em>, <em>numClasses=2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#LogisticRegressionWithLBFGS.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.LogisticRegressionWithLBFGS.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a logistic regression model on the given data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The training data, an RDD of LabeledPoint.</li>
<li><strong>iterations</strong> – The number of iterations.
(default: 100)</li>
<li><strong>initialWeights</strong> – The initial weights.
(default: None)</li>
<li><strong>regParam</strong> – The regularizer parameter.
(default: 0.0)</li>
<li><strong>regType</strong><p>The type of regularizer used for training our model.
Supported values:</p>
<blockquote>
<div><ul>
<li>"l1" for using L1 regularization</li>
<li>"l2" for using L2 regularization (default)</li>
<li>None for no regularization</li>
</ul>
</div></blockquote>
</li>
<li><strong>intercept</strong> – Boolean parameter which indicates whether to use the
augmented representation for training data (i.e., whether bias
features are activated or not).
(default: False)</li>
<li><strong>corrections</strong> – The number of corrections used in the LBFGS update.
If a known updater is used for binary classification,
it calls the ml implementation and this parameter will
have no effect. (default: 10)</li>
<li><strong>tolerance</strong> – The convergence tolerance of iterations for L-BFGS.
(default: 1e-6)</li>
<li><strong>validateData</strong> – Boolean parameter which indicates if the algorithm should
validate data before training.
(default: True)</li>
<li><strong>numClasses</strong> – The number of classes (i.e., outcomes) a label can take in
Multinomial Logistic Regression.
(default: 2)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LogisticRegressionWithLBFGS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="go">0</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.classification.SVMModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">SVMModel</code><span class="sig-paren">(</span><em>weights</em>, <em>intercept</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#SVMModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.SVMModel" title="Permalink to this definition"></a></dt>
<dd><p>Model for Support Vector Machines (SVMs).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>weights</strong> – Weights computed for every feature.</li>
<li><strong>intercept</strong> – Intercept computed for this model.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span> <span class="o">=</span> <span class="n">SVMWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">1.0</span><span class="p">])</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">1.0</span><span class="p">]]))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">clearThreshold</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="go">1.44...</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">sparse_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span> <span class="o">=</span> <span class="n">SVMWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">}))</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">SVMModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">}))</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.classification.SVMModel.clearThreshold">
<code class="descname">clearThreshold</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.clearThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Clears the threshold so that <cite>predict</cite> will output raw
prediction scores. It is used for binary classification only.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.SVMModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Intercept computed for this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.classification.SVMModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#SVMModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.SVMModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#SVMModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict values for a single data point or an RDD of points
using the trained model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.SVMModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#SVMModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.SVMModel.setThreshold">
<code class="descname">setThreshold</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.setThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Sets the threshold that separates positive predictions from
negative predictions. An example with prediction score greater
than or equal to this threshold is identified as a positive,
and negative otherwise. It is used for binary classification
only.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.SVMModel.threshold">
<code class="descname">threshold</code><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.threshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns the threshold (if any) used for converting raw
prediction scores into 0/1 predictions. It is used for
binary classification only.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.classification.SVMModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.mllib.classification.SVMModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weights computed for every feature.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.classification.SVMWithSGD">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">SVMWithSGD</code><a class="reference internal" href="_modules/pyspark/mllib/classification.html#SVMWithSGD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.SVMWithSGD" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.classification.SVMWithSGD.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>iterations=100</em>, <em>step=1.0</em>, <em>regParam=0.01</em>, <em>miniBatchFraction=1.0</em>, <em>initialWeights=None</em>, <em>regType='l2'</em>, <em>intercept=False</em>, <em>validateData=True</em>, <em>convergenceTol=0.001</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#SVMWithSGD.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.SVMWithSGD.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a support vector machine on the given data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The training data, an RDD of LabeledPoint.</li>
<li><strong>iterations</strong> – The number of iterations.
(default: 100)</li>
<li><strong>step</strong> – The step parameter used in SGD.
(default: 1.0)</li>
<li><strong>regParam</strong> – The regularizer parameter.
(default: 0.01)</li>
<li><strong>miniBatchFraction</strong> – Fraction of data to be used for each SGD iteration.
(default: 1.0)</li>
<li><strong>initialWeights</strong> – The initial weights.
(default: None)</li>
<li><strong>regType</strong><p>The type of regularizer used for training our model.
Allowed values:</p>
<blockquote>
<div><ul>
<li>"l1" for using L1 regularization</li>
<li>"l2" for using L2 regularization (default)</li>
<li>None for no regularization</li>
</ul>
</div></blockquote>
</li>
<li><strong>intercept</strong> – Boolean parameter which indicates whether to use the
augmented representation for training data (i.e. whether bias
features are activated or not).
(default: False)</li>
<li><strong>validateData</strong> – Boolean parameter which indicates if the algorithm should
validate data before training.
(default: True)</li>
<li><strong>convergenceTol</strong> – A condition which decides iteration termination.
(default: 0.001)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.classification.NaiveBayesModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">NaiveBayesModel</code><span class="sig-paren">(</span><em>labels</em>, <em>pi</em>, <em>theta</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#NaiveBayesModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.NaiveBayesModel" title="Permalink to this definition"></a></dt>
<dd><p>Model for Naive Bayes classifiers.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>labels</strong> – List of labels.</li>
<li><strong>pi</strong> – Log of class priors, whose dimension is C, number of labels.</li>
<li><strong>theta</strong> – Log of class conditional probabilities, whose dimension is C-by-D,
where D is number of features.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]))</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]]))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1.0]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sparse_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">NaiveBayesModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.classification.NaiveBayesModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#NaiveBayesModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.NaiveBayesModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.NaiveBayesModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#NaiveBayesModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.NaiveBayesModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Return the most likely class for a data vector
or an RDD of vectors.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.NaiveBayesModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#NaiveBayesModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.NaiveBayesModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.classification.NaiveBayes">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">NaiveBayes</code><a class="reference internal" href="_modules/pyspark/mllib/classification.html#NaiveBayes"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.NaiveBayes" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.classification.NaiveBayes.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>lambda_=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#NaiveBayes.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.NaiveBayes.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a Naive Bayes model given an RDD of (label, features)
vectors.</p>
<p>This is the Multinomial NB (<a class="reference external" href="http://tinyurl.com/lsdw6p">http://tinyurl.com/lsdw6p</a>) which
can handle all kinds of discrete data. For example, by
converting documents into TF-IDF vectors, it can be used for
document classification. By making every vector a 0-1 vector,
it can also be used as Bernoulli NB (<a class="reference external" href="http://tinyurl.com/p7c96j6">http://tinyurl.com/p7c96j6</a>).
The input feature values must be nonnegative.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – RDD of LabeledPoint.</li>
<li><strong>lambda_</strong> – The smoothing parameter.
(default: 1.0)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.classification.StreamingLogisticRegressionWithSGD">
<em class="property">class </em><code class="descclassname">pyspark.mllib.classification.</code><code class="descname">StreamingLogisticRegressionWithSGD</code><span class="sig-paren">(</span><em>stepSize=0.1</em>, <em>numIterations=50</em>, <em>miniBatchFraction=1.0</em>, <em>regParam=0.0</em>, <em>convergenceTol=0.001</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#StreamingLogisticRegressionWithSGD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.StreamingLogisticRegressionWithSGD" title="Permalink to this definition"></a></dt>
<dd><p>Train or predict a logistic regression model on streaming data.
Training uses Stochastic Gradient Descent to update the model based on
each new batch of incoming data from a DStream.</p>
<p>Each batch of data is assumed to be an RDD of LabeledPoints.
The number of data points per batch can vary, but the number
of features must be constant. An initial weight
vector must be provided.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>stepSize</strong> – Step size for each iteration of gradient descent.
(default: 0.1)</li>
<li><strong>numIterations</strong> – Number of iterations run for each batch of data.
(default: 50)</li>
<li><strong>miniBatchFraction</strong> – Fraction of each batch of data to use for updates.
(default: 1.0)</li>
<li><strong>regParam</strong> – L2 Regularization parameter.
(default: 0.0)</li>
<li><strong>convergenceTol</strong> – Value used to determine when to terminate iterations.
(default: 0.001)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.latestModel">
<code class="descname">latestModel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.latestModel" title="Permalink to this definition"></a></dt>
<dd><p>Returns the latest model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.predictOn">
<code class="descname">predictOn</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.predictOn" title="Permalink to this definition"></a></dt>
<dd><p>Use the model to make predictions on batches of data from a
DStream.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">DStream containing predictions.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.predictOnValues">
<code class="descname">predictOnValues</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.predictOnValues" title="Permalink to this definition"></a></dt>
<dd><p>Use the model to make predictions on the values of a DStream and
carry over its keys.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">DStream containing the input keys and the predictions as values.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.setInitialWeights">
<code class="descname">setInitialWeights</code><span class="sig-paren">(</span><em>initialWeights</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#StreamingLogisticRegressionWithSGD.setInitialWeights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.setInitialWeights" title="Permalink to this definition"></a></dt>
<dd><p>Set the initial value of weights.</p>
<p>This must be set before running trainOn and predictOn.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.trainOn">
<code class="descname">trainOn</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/classification.html#StreamingLogisticRegressionWithSGD.trainOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.classification.StreamingLogisticRegressionWithSGD.trainOn" title="Permalink to this definition"></a></dt>
<dd><p>Train the model on the incoming dstream.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.clustering">
<span id="pyspark-mllib-clustering-module"></span><h2>pyspark.mllib.clustering module<a class="headerlink" href="#module-pyspark.mllib.clustering" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.clustering.BisectingKMeansModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">BisectingKMeansModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#BisectingKMeansModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.BisectingKMeansModel" title="Permalink to this definition"></a></dt>
<dd><p>A clustering model derived from the bisecting k-means method.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">,</span><span class="mf">8.0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">,</span><span class="mf">9.0</span><span class="p">])</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bskm</span> <span class="o">=</span> <span class="n">BisectingKMeans</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">bskm</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span> <span class="n">k</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">p</span> <span class="o">=</span> <span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">p</span><span class="p">)</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">k</span>
<span class="go">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">computeCost</span><span class="p">(</span><span class="n">p</span><span class="p">)</span>
<span class="go">0.0</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.BisectingKMeansModel.clusterCenters">
<code class="descname">clusterCenters</code><a class="headerlink" href="#pyspark.mllib.clustering.BisectingKMeansModel.clusterCenters" title="Permalink to this definition"></a></dt>
<dd><p>Get the cluster centers, represented as a list of NumPy
arrays.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.BisectingKMeansModel.computeCost">
<code class="descname">computeCost</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#BisectingKMeansModel.computeCost"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.BisectingKMeansModel.computeCost" title="Permalink to this definition"></a></dt>
<dd><p>Return the Bisecting K-means cost (sum of squared distances of
points to their nearest center) for this model on the given
data. If provided with an RDD of points returns the sum.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – A data point (or RDD of points) to compute the cost(s).</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.BisectingKMeansModel.k">
<code class="descname">k</code><a class="headerlink" href="#pyspark.mllib.clustering.BisectingKMeansModel.k" title="Permalink to this definition"></a></dt>
<dd><p>Get the number of clusters.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.BisectingKMeansModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#BisectingKMeansModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.BisectingKMeansModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Find the cluster that each of the points belongs to in this
model.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – A data point (or RDD of points) to determine cluster index.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Predicted cluster index or an RDD of predicted cluster indices
if the input is an RDD.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.BisectingKMeans">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">BisectingKMeans</code><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#BisectingKMeans"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.BisectingKMeans" title="Permalink to this definition"></a></dt>
<dd><p>A bisecting k-means algorithm based on the paper “A comparison of
document clustering techniques” by Steinbach, Karypis, and Kumar,
with modification to fit Spark.
The algorithm starts from a single cluster that contains all points.
Iteratively it finds divisible clusters on the bottom level and
bisects each of them using k-means, until there are <cite>k</cite> leaf
clusters in total or no leaf clusters are divisible.
The bisecting steps of clusters on the same level are grouped
together to increase parallelism. If bisecting all divisible
clusters on the bottom level would result in more than <cite>k</cite> leaf
clusters, larger clusters get higher priority.</p>
<p>Based on
<a class="reference external" href="http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf">http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf</a>
Steinbach, Karypis, and Kumar, A comparison of document clustering
techniques, KDD Workshop on Text Mining, 2000.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.BisectingKMeans.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>rdd</em>, <em>k=4</em>, <em>maxIterations=20</em>, <em>minDivisibleClusterSize=1.0</em>, <em>seed=-1888008604</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#BisectingKMeans.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.BisectingKMeans.train" title="Permalink to this definition"></a></dt>
<dd><p>Runs the bisecting k-means algorithm and returns the model.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rdd</strong> – Training points as an <cite>RDD</cite> of <cite>Vector</cite> or convertible
sequence types.</li>
<li><strong>k</strong> – The desired number of leaf clusters. The actual number could
be smaller if there are no divisible leaf clusters.
(default: 4)</li>
<li><strong>maxIterations</strong> – Maximum number of iterations allowed to split clusters.
(default: 20)</li>
<li><strong>minDivisibleClusterSize</strong> – Minimum number of points (if &gt;= 1.0) or the minimum proportion
of points (if &lt; 1.0) of a divisible cluster.
(default: 1.0)</li>
<li><strong>seed</strong> – Random seed value for cluster initialization.
(default: -1888008604 from classOf[BisectingKMeans].getName.##)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.KMeansModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">KMeansModel</code><span class="sig-paren">(</span><em>centers</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#KMeansModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.KMeansModel" title="Permalink to this definition"></a></dt>
<dd><p>A clustering model derived from the k-means method.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">,</span><span class="mf">8.0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">,</span><span class="mf">9.0</span><span class="p">])</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">KMeans</span><span class="o">.</span><span class="n">train</span><span class="p">(</span>
<span class="gp">... </span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="mi">2</span><span class="p">,</span> <span class="n">maxIterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">initializationMode</span><span class="o">=</span><span class="s2">&quot;random&quot;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">seed</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span> <span class="n">initializationSteps</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">epsilon</span><span class="o">=</span><span class="mf">1e-4</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]))</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]))</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">8.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">]))</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">9.0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">]))</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">k</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">computeCost</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">))</span>
<span class="go">2.0000000000000004</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">KMeans</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sparse_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}),</span>
<span class="gp">... </span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.1</span><span class="p">}),</span>
<span class="gp">... </span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">2</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}),</span>
<span class="gp">... </span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">2</span><span class="p">:</span> <span class="mf">1.1</span><span class="p">})</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">KMeans</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">),</span> <span class="mi">2</span><span class="p">,</span> <span class="n">initializationMode</span><span class="o">=</span><span class="s2">&quot;k-means||&quot;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">seed</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span> <span class="n">initializationSteps</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">epsilon</span><span class="o">=</span><span class="mf">1e-4</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">,</span> <span class="mf">0.</span><span class="p">]))</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mf">1.1</span><span class="p">,</span> <span class="mf">0.</span><span class="p">]))</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.</span><span class="p">,</span> <span class="mf">0.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">]))</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.1</span><span class="p">]))</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">[</span><span class="mi">3</span><span class="p">])</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">KMeansModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">array</span><span class="p">([</span><span class="o">-</span><span class="mf">383.1</span><span class="p">,</span><span class="o">-</span><span class="mf">382.9</span><span class="p">,</span> <span class="mf">28.7</span><span class="p">,</span><span class="mf">31.2</span><span class="p">,</span> <span class="mf">366.2</span><span class="p">,</span><span class="mf">367.3</span><span class="p">])</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">KMeans</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="mi">3</span><span class="p">,</span> <span class="n">maxIterations</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">initialModel</span> <span class="o">=</span> <span class="n">KMeansModel</span><span class="p">([(</span><span class="o">-</span><span class="mf">1000.0</span><span class="p">,</span><span class="o">-</span><span class="mf">1000.0</span><span class="p">),(</span><span class="mf">5.0</span><span class="p">,</span><span class="mf">5.0</span><span class="p">),(</span><span class="mf">1000.0</span><span class="p">,</span><span class="mf">1000.0</span><span class="p">)]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span>
<span class="go">[array([-1000., -1000.]), array([ 5., 5.]), array([ 1000., 1000.])]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.KMeansModel.clusterCenters">
<code class="descname">clusterCenters</code><a class="headerlink" href="#pyspark.mllib.clustering.KMeansModel.clusterCenters" title="Permalink to this definition"></a></dt>
<dd><p>Get the cluster centers, represented as a list of NumPy arrays.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.KMeansModel.computeCost">
<code class="descname">computeCost</code><span class="sig-paren">(</span><em>rdd</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#KMeansModel.computeCost"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.KMeansModel.computeCost" title="Permalink to this definition"></a></dt>
<dd><p>Return the K-means cost (sum of squared distances of points to
their nearest center) for this model on the given
data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rdd</strong> – The RDD of points to compute the cost on.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.KMeansModel.k">
<code class="descname">k</code><a class="headerlink" href="#pyspark.mllib.clustering.KMeansModel.k" title="Permalink to this definition"></a></dt>
<dd><p>Total number of clusters.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.KMeansModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#KMeansModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.KMeansModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.KMeansModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#KMeansModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.KMeansModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Find the cluster that each of the points belongs to in this
model.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – A data point (or RDD of points) to determine cluster index.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Predicted cluster index or an RDD of predicted cluster indices
if the input is an RDD.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.KMeansModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#KMeansModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.KMeansModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.KMeans">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">KMeans</code><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#KMeans"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.KMeans" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.KMeans.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>rdd</em>, <em>k</em>, <em>maxIterations=100</em>, <em>runs=1</em>, <em>initializationMode='k-means||'</em>, <em>seed=None</em>, <em>initializationSteps=2</em>, <em>epsilon=0.0001</em>, <em>initialModel=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#KMeans.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.KMeans.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a k-means clustering model.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rdd</strong> – Training points as an <cite>RDD</cite> of <cite>Vector</cite> or convertible
sequence types.</li>
<li><strong>k</strong> – Number of clusters to create.</li>
<li><strong>maxIterations</strong> – Maximum number of iterations allowed.
(default: 100)</li>
<li><strong>runs</strong> – This param has no effect since Spark 2.0.0.</li>
<li><strong>initializationMode</strong> – The initialization algorithm. This can be either “random” or
“k-means||”.
(default: “k-means||”)</li>
<li><strong>seed</strong> – Random seed value for cluster initialization. Set as None to
generate seed based on system time.
(default: None)</li>
<li><strong>initializationSteps</strong> – Number of steps for the k-means|| initialization mode.
This is an advanced setting – the default of 2 is almost
always enough.
(default: 2)</li>
<li><strong>epsilon</strong> – Distance threshold within which a center will be considered to
have converged. If all centers move less than this Euclidean
distance, iterations are stopped.
(default: 1e-4)</li>
<li><strong>initialModel</strong> – Initial cluster centers can be provided as a KMeansModel object
rather than using the random or k-means|| initializationMode.
(default: None)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.GaussianMixtureModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">GaussianMixtureModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#GaussianMixtureModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixtureModel" title="Permalink to this definition"></a></dt>
<dd><p>A clustering model derived from the Gaussian Mixture Model method.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span><span class="p">,</span> <span class="n">DenseMatrix</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">numpy.testing</span> <span class="k">import</span> <span class="n">assert_equal</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">clusterdata_1</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="o">-</span><span class="mf">0.1</span><span class="p">,</span><span class="o">-</span><span class="mf">0.05</span><span class="p">,</span><span class="o">-</span><span class="mf">0.01</span><span class="p">,</span><span class="o">-</span><span class="mf">0.1</span><span class="p">,</span>
<span class="gp">... </span> <span class="mf">0.9</span><span class="p">,</span><span class="mf">0.8</span><span class="p">,</span><span class="mf">0.75</span><span class="p">,</span><span class="mf">0.935</span><span class="p">,</span>
<span class="gp">... </span> <span class="o">-</span><span class="mf">0.83</span><span class="p">,</span><span class="o">-</span><span class="mf">0.68</span><span class="p">,</span><span class="o">-</span><span class="mf">0.91</span><span class="p">,</span><span class="o">-</span><span class="mf">0.76</span> <span class="p">])</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">GaussianMixture</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">clusterdata_1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">convergenceTol</span><span class="o">=</span><span class="mf">0.0001</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">maxIterations</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">labels</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">clusterdata_1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">labels</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">==</span><span class="n">labels</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="go">False</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">labels</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">==</span><span class="n">labels</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
<span class="go">False</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">labels</span><span class="p">[</span><span class="mi">4</span><span class="p">]</span><span class="o">==</span><span class="n">labels</span><span class="p">[</span><span class="mi">5</span><span class="p">]</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="o">-</span><span class="mf">0.1</span><span class="p">,</span><span class="o">-</span><span class="mf">0.05</span><span class="p">])</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">softPredicted</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predictSoft</span><span class="p">([</span><span class="o">-</span><span class="mf">0.1</span><span class="p">,</span><span class="o">-</span><span class="mf">0.05</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">softPredicted</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="mf">1.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">softPredicted</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">-</span> <span class="mf">0.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">softPredicted</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span> <span class="o">-</span> <span class="mf">0.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">GaussianMixtureModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">assert_equal</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">weights</span><span class="p">,</span> <span class="n">sameModel</span><span class="o">.</span><span class="n">weights</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mus</span><span class="p">,</span> <span class="n">sigmas</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span>
<span class="gp">... </span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="p">[(</span><span class="n">g</span><span class="o">.</span><span class="n">mu</span><span class="p">,</span> <span class="n">g</span><span class="o">.</span><span class="n">sigma</span><span class="p">)</span> <span class="k">for</span> <span class="n">g</span> <span class="ow">in</span> <span class="n">model</span><span class="o">.</span><span class="n">gaussians</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameMus</span><span class="p">,</span> <span class="n">sameSigmas</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span>
<span class="gp">... </span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="p">[(</span><span class="n">g</span><span class="o">.</span><span class="n">mu</span><span class="p">,</span> <span class="n">g</span><span class="o">.</span><span class="n">sigma</span><span class="p">)</span> <span class="k">for</span> <span class="n">g</span> <span class="ow">in</span> <span class="n">sameModel</span><span class="o">.</span><span class="n">gaussians</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mus</span> <span class="o">==</span> <span class="n">sameMus</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sigmas</span> <span class="o">==</span> <span class="n">sameSigmas</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">array</span><span class="p">([</span><span class="o">-</span><span class="mf">5.1971</span><span class="p">,</span> <span class="o">-</span><span class="mf">2.5359</span><span class="p">,</span> <span class="o">-</span><span class="mf">3.8220</span><span class="p">,</span>
<span class="gp">... </span> <span class="o">-</span><span class="mf">5.2211</span><span class="p">,</span> <span class="o">-</span><span class="mf">5.0602</span><span class="p">,</span> <span class="mf">4.7118</span><span class="p">,</span>
<span class="gp">... </span> <span class="mf">6.8989</span><span class="p">,</span> <span class="mf">3.4592</span><span class="p">,</span> <span class="mf">4.6322</span><span class="p">,</span>
<span class="gp">... </span> <span class="mf">5.7048</span><span class="p">,</span> <span class="mf">4.6567</span><span class="p">,</span> <span class="mf">5.5026</span><span class="p">,</span>
<span class="gp">... </span> <span class="mf">4.5605</span><span class="p">,</span> <span class="mf">5.2043</span><span class="p">,</span> <span class="mf">6.2734</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">clusterdata_2</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span><span class="mi">3</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">GaussianMixture</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">clusterdata_2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">convergenceTol</span><span class="o">=</span><span class="mf">0.0001</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">maxIterations</span><span class="o">=</span><span class="mi">150</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">labels</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">clusterdata_2</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">labels</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">==</span><span class="n">labels</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">labels</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">==</span><span class="n">labels</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">==</span><span class="n">labels</span><span class="p">[</span><span class="mi">4</span><span class="p">]</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.GaussianMixtureModel.gaussians">
<code class="descname">gaussians</code><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixtureModel.gaussians" title="Permalink to this definition"></a></dt>
<dd><p>Array of MultivariateGaussian where gaussians[i] represents
the Multivariate Gaussian (Normal) Distribution for Gaussian i.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.GaussianMixtureModel.k">
<code class="descname">k</code><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixtureModel.k" title="Permalink to this definition"></a></dt>
<dd><p>Number of gaussians in mixture.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.GaussianMixtureModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#GaussianMixtureModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixtureModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load the GaussianMixtureModel from disk.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>sc</strong> – SparkContext.</li>
<li><strong>path</strong> – Path to where the model is stored.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.GaussianMixtureModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#GaussianMixtureModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixtureModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Find the cluster to which the point ‘x’ or each point in RDD ‘x’
has maximum membership in this model.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – A feature vector or an RDD of vectors representing data points.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Predicted cluster label or an RDD of predicted cluster labels
if the input is an RDD.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.GaussianMixtureModel.predictSoft">
<code class="descname">predictSoft</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#GaussianMixtureModel.predictSoft"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixtureModel.predictSoft" title="Permalink to this definition"></a></dt>
<dd><p>Find the membership of point ‘x’ or each point in RDD ‘x’ to all mixture components.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – A feature vector or an RDD of vectors representing data points.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">The membership value to all mixture components for vector ‘x’
or each vector in RDD ‘x’.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.GaussianMixtureModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixtureModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weights for each Gaussian distribution in the mixture, where weights[i] is
the weight for Gaussian i, and weights.sum == 1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.GaussianMixture">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">GaussianMixture</code><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#GaussianMixture"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixture" title="Permalink to this definition"></a></dt>
<dd><p>Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.GaussianMixture.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>rdd</em>, <em>k</em>, <em>convergenceTol=0.001</em>, <em>maxIterations=100</em>, <em>seed=None</em>, <em>initialModel=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#GaussianMixture.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.GaussianMixture.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a Gaussian Mixture clustering model.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rdd</strong> – Training points as an <cite>RDD</cite> of <cite>Vector</cite> or convertible
sequence types.</li>
<li><strong>k</strong> – Number of independent Gaussians in the mixture model.</li>
<li><strong>convergenceTol</strong> – Maximum change in log-likelihood at which convergence is
considered to have occurred.
(default: 1e-3)</li>
<li><strong>maxIterations</strong> – Maximum number of iterations allowed.
(default: 100)</li>
<li><strong>seed</strong> – Random seed for initial Gaussian distribution. Set as None to
generate seed based on system time.
(default: None)</li>
<li><strong>initialModel</strong> – Initial GMM starting point, bypassing the random
initialization.
(default: None)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.PowerIterationClusteringModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">PowerIterationClusteringModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#PowerIterationClusteringModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.PowerIterationClusteringModel" title="Permalink to this definition"></a></dt>
<dd><p>Model produced by <code class="docutils literal"><span class="pre">PowerIterationClustering</span></code>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">math</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">genCircle</span><span class="p">(</span><span class="n">r</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span>
<span class="gp">... </span> <span class="n">points</span> <span class="o">=</span> <span class="p">[]</span>
<span class="gp">... </span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span>
<span class="gp">... </span> <span class="n">theta</span> <span class="o">=</span> <span class="mf">2.0</span> <span class="o">*</span> <span class="n">math</span><span class="o">.</span><span class="n">pi</span> <span class="o">*</span> <span class="n">i</span> <span class="o">/</span> <span class="n">n</span>
<span class="gp">... </span> <span class="n">points</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">r</span> <span class="o">*</span> <span class="n">math</span><span class="o">.</span><span class="n">cos</span><span class="p">(</span><span class="n">theta</span><span class="p">),</span> <span class="n">r</span> <span class="o">*</span> <span class="n">math</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="n">theta</span><span class="p">)))</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">points</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">sim</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">):</span>
<span class="gp">... </span> <span class="n">dist2</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="n">y</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">*</span> <span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="n">y</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">-</span> <span class="n">y</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> <span class="o">*</span> <span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">-</span> <span class="n">y</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">math</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="o">-</span><span class="n">dist2</span> <span class="o">/</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r1</span> <span class="o">=</span> <span class="mf">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">n1</span> <span class="o">=</span> <span class="mi">10</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r2</span> <span class="o">=</span> <span class="mf">4.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">n2</span> <span class="o">=</span> <span class="mi">40</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">n</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">points</span> <span class="o">=</span> <span class="n">genCircle</span><span class="p">(</span><span class="n">r1</span><span class="p">,</span> <span class="n">n1</span><span class="p">)</span> <span class="o">+</span> <span class="n">genCircle</span><span class="p">(</span><span class="n">r2</span><span class="p">,</span> <span class="n">n2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">similarities</span> <span class="o">=</span> <span class="p">[(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">,</span> <span class="n">sim</span><span class="p">(</span><span class="n">points</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">points</span><span class="p">[</span><span class="n">j</span><span class="p">]))</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">n</span><span class="p">)</span> <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">i</span><span class="p">)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">similarities</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">PowerIterationClustering</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">40</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">k</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">assignments</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">id</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="p">[</span><span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">5</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">7</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">PowerIterationClusteringModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">k</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">assignments</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">id</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="p">[</span><span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">5</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span> <span class="o">==</span> <span class="n">result</span><span class="p">[</span><span class="mi">7</span><span class="p">]</span><span class="o">.</span><span class="n">cluster</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.clustering.PowerIterationClusteringModel.assignments">
<code class="descname">assignments</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#PowerIterationClusteringModel.assignments"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.PowerIterationClusteringModel.assignments" title="Permalink to this definition"></a></dt>
<dd><p>Returns the cluster assignments of this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.PowerIterationClusteringModel.k">
<code class="descname">k</code><a class="headerlink" href="#pyspark.mllib.clustering.PowerIterationClusteringModel.k" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of clusters.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.PowerIterationClusteringModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#PowerIterationClusteringModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.PowerIterationClusteringModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.PowerIterationClustering">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">PowerIterationClustering</code><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#PowerIterationClustering"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.PowerIterationClustering" title="Permalink to this definition"></a></dt>
<dd><p>Power Iteration Clustering (PIC), a scalable graph clustering algorithm
developed by <a class="reference external" href="http://www.icml2010.org/papers/387.pdf">Lin and Cohen</a>.
From the abstract: PIC finds a very low-dimensional embedding of a
dataset using truncated power iteration on a normalized pair-wise
similarity matrix of the data.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="class">
<dt id="pyspark.mllib.clustering.PowerIterationClustering.Assignment">
<em class="property">class </em><code class="descname">Assignment</code><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#PowerIterationClustering.Assignment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.PowerIterationClustering.Assignment" title="Permalink to this definition"></a></dt>
<dd><p>Represents an (id, cluster) tuple.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.PowerIterationClustering.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>rdd</em>, <em>k</em>, <em>maxIterations=100</em>, <em>initMode='random'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#PowerIterationClustering.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.PowerIterationClustering.train" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rdd</strong> – An RDD of (i, j, s<sub>ij</sub>) tuples representing the
affinity matrix, which is the matrix A in the PIC paper. The
similarity s<sub>ij</sub> must be nonnegative. This is a symmetric
matrix and hence s<sub>ij</sub> = s<sub>ji</sub>. For any (i, j) with
nonzero similarity, there should be either (i, j, s<sub>ij</sub>) or
(j, i, s<sub>ji</sub>) in the input. Tuples with i = j are ignored,
because it is assumed s<sub>ij</sub> = 0.0.</li>
<li><strong>k</strong> – Number of clusters.</li>
<li><strong>maxIterations</strong> – Maximum number of iterations of the PIC algorithm.
(default: 100)</li>
<li><strong>initMode</strong> – Initialization mode. This can be either “random” to use
a random vector as vertex properties, or “degree” to use
normalized sum similarities.
(default: “random”)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.StreamingKMeans">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">StreamingKMeans</code><span class="sig-paren">(</span><em>k=2</em>, <em>decayFactor=1.0</em>, <em>timeUnit='batches'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans" title="Permalink to this definition"></a></dt>
<dd><p>Provides methods to set k, decayFactor, timeUnit to configure the
KMeans algorithm for fitting and predicting on incoming dstreams.
More details on how the centroids are updated are provided under the
docs of StreamingKMeansModel.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>k</strong> – Number of clusters.
(default: 2)</li>
<li><strong>decayFactor</strong> – Forgetfulness of the previous centroids.
(default: 1.0)</li>
<li><strong>timeUnit</strong> – Can be “batches” or “points”. If points, then the decay factor is
raised to the power of number of new points and if batches, then
decay factor will be used as is.
(default: “batches”)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.latestModel">
<code class="descname">latestModel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.latestModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.latestModel" title="Permalink to this definition"></a></dt>
<dd><p>Return the latest model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.predictOn">
<code class="descname">predictOn</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.predictOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.predictOn" title="Permalink to this definition"></a></dt>
<dd><p>Make predictions on a dstream.
Returns a transformed dstream object.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.predictOnValues">
<code class="descname">predictOnValues</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.predictOnValues"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.predictOnValues" title="Permalink to this definition"></a></dt>
<dd><p>Make predictions on a keyed dstream.
Returns a transformed dstream object.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setDecayFactor">
<code class="descname">setDecayFactor</code><span class="sig-paren">(</span><em>decayFactor</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.setDecayFactor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setDecayFactor" title="Permalink to this definition"></a></dt>
<dd><p>Set decay factor.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setHalfLife">
<code class="descname">setHalfLife</code><span class="sig-paren">(</span><em>halfLife</em>, <em>timeUnit</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.setHalfLife"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setHalfLife" title="Permalink to this definition"></a></dt>
<dd><p>Set number of batches after which the centroids of that
particular batch have half the weightage.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setInitialCenters">
<code class="descname">setInitialCenters</code><span class="sig-paren">(</span><em>centers</em>, <em>weights</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.setInitialCenters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setInitialCenters" title="Permalink to this definition"></a></dt>
<dd><p>Set initial centers. Should be set before calling trainOn.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setK">
<code class="descname">setK</code><span class="sig-paren">(</span><em>k</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.setK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setK" title="Permalink to this definition"></a></dt>
<dd><p>Set number of clusters.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.setRandomCenters">
<code class="descname">setRandomCenters</code><span class="sig-paren">(</span><em>dim</em>, <em>weight</em>, <em>seed</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.setRandomCenters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.setRandomCenters" title="Permalink to this definition"></a></dt>
<dd><p>Set the initial centers to be random samples from
a Gaussian population with constant weights.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeans.trainOn">
<code class="descname">trainOn</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeans.trainOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeans.trainOn" title="Permalink to this definition"></a></dt>
<dd><p>Train the model on the incoming dstream.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.StreamingKMeansModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">StreamingKMeansModel</code><span class="sig-paren">(</span><em>clusterCenters</em>, <em>clusterWeights</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeansModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeansModel" title="Permalink to this definition"></a></dt>
<dd><p>Clustering model which can perform an online update of the centroids.</p>
<p>The update formula for each centroid is given by</p>
<ul class="simple">
<li>c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t * a + m_t)</li>
<li>n_t+1 = n_t * a + m_t</li>
</ul>
<p>where</p>
<ul class="simple">
<li>c_t: Centroid at the t_th iteration.</li>
<li>n_t: Number of samples (or) weights associated with the centroid
at the t_th iteration.</li>
<li>x_t: Centroid of the new data closest to c_t.</li>
<li>m_t: Number of samples (or) weights of the new data closest to c_t</li>
<li>c_t+1: New centroid.</li>
<li>n_t+1: New number of weights.</li>
<li>a: Decay Factor, which gives the forgetfulness.</li>
</ul>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">If a is set to 1, it is the weighted mean of the previous
and new data. If it is set to zero, the old centroids are completely
forgotten.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>clusterCenters</strong> – Initial cluster centers.</li>
<li><strong>clusterWeights</strong> – List of weights assigned to each cluster.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">initCenters</span> <span class="o">=</span> <span class="p">[[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">initWeights</span> <span class="o">=</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span> <span class="o">=</span> <span class="n">StreamingKMeansModel</span><span class="p">(</span><span class="n">initCenters</span><span class="p">,</span> <span class="n">initWeights</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="o">-</span><span class="mf">0.1</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.1</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mf">0.9</span><span class="p">,</span> <span class="mf">0.9</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.1</span><span class="p">,</span> <span class="mf">1.1</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span> <span class="o">=</span> <span class="n">stkm</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="sa">u</span><span class="s2">&quot;batches&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">centers</span>
<span class="go">array([[ 0., 0.],</span>
<span class="go"> [ 1., 1.]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="o">-</span><span class="mf">0.1</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.1</span><span class="p">])</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.9</span><span class="p">,</span> <span class="mf">0.9</span><span class="p">])</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">clusterWeights</span>
<span class="go">[3.0, 3.0]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">decayFactor</span> <span class="o">=</span> <span class="mf">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">1.5</span><span class="p">,</span> <span class="mf">1.5</span><span class="p">]),</span> <span class="n">DenseVector</span><span class="p">([</span><span class="mf">0.2</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span> <span class="o">=</span> <span class="n">stkm</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="sa">u</span><span class="s2">&quot;batches&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">centers</span>
<span class="go">array([[ 0.2, 0.2],</span>
<span class="go"> [ 1.5, 1.5]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">clusterWeights</span>
<span class="go">[1.0, 1.0]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.2</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">])</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stkm</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">1.5</span><span class="p">,</span> <span class="mf">1.5</span><span class="p">])</span>
<span class="go">1</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.clustering.StreamingKMeansModel.clusterWeights">
<code class="descname">clusterWeights</code><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeansModel.clusterWeights" title="Permalink to this definition"></a></dt>
<dd><p>Return the cluster weights.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.StreamingKMeansModel.update">
<code class="descname">update</code><span class="sig-paren">(</span><em>data</em>, <em>decayFactor</em>, <em>timeUnit</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#StreamingKMeansModel.update"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.StreamingKMeansModel.update" title="Permalink to this definition"></a></dt>
<dd><p>Update the centroids, according to data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – RDD with new data for the model update.</li>
<li><strong>decayFactor</strong> – Forgetfulness of the previous centroids.</li>
<li><strong>timeUnit</strong> – Can be “batches” or “points”. If points, then the decay factor
is raised to the power of number of new points and if batches,
then decay factor will be used as is.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.LDA">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">LDA</code><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#LDA"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.LDA" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.LDA.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>rdd</em>, <em>k=10</em>, <em>maxIterations=20</em>, <em>docConcentration=-1.0</em>, <em>topicConcentration=-1.0</em>, <em>seed=None</em>, <em>checkpointInterval=10</em>, <em>optimizer='em'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#LDA.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.LDA.train" title="Permalink to this definition"></a></dt>
<dd><p>Train an LDA model.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rdd</strong> – RDD of documents, which are tuples of document IDs and term
(word) count vectors. The term count vectors are “bags of
words” with a fixed-size vocabulary (where the vocabulary size
is the length of the vector). Document IDs must be unique
and &gt;= 0.</li>
<li><strong>k</strong> – Number of topics to infer, i.e., the number of soft cluster
centers.
(default: 10)</li>
<li><strong>maxIterations</strong> – Maximum number of iterations allowed.
(default: 20)</li>
<li><strong>docConcentration</strong> – Concentration parameter (commonly named “alpha”) for the prior
placed on documents’ distributions over topics (“theta”).
(default: -1.0)</li>
<li><strong>topicConcentration</strong> – Concentration parameter (commonly named “beta” or “eta”) for
the prior placed on topics’ distributions over terms.
(default: -1.0)</li>
<li><strong>seed</strong> – Random seed for cluster initialization. Set as None to generate
seed based on system time.
(default: None)</li>
<li><strong>checkpointInterval</strong> – Period (in iterations) between checkpoints.
(default: 10)</li>
<li><strong>optimizer</strong> – LDAOptimizer used to perform the actual calculation. Currently
“em”, “online” are supported.
(default: “em”)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.clustering.LDAModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.clustering.</code><code class="descname">LDAModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#LDAModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.LDAModel" title="Permalink to this definition"></a></dt>
<dd><p>A clustering model derived from the LDA method.</p>
<p>Latent Dirichlet Allocation (LDA), a topic model designed for text documents.</p>
<p>Terminology:</p>
<ul class="simple">
<li>“word” = “term”: an element of the vocabulary</li>
<li>“token”: instance of a term appearing in a document</li>
<li>“topic”: multinomial distribution over words representing some concept</li>
</ul>
<p>References:</p>
<ul class="simple">
<li>Original LDA paper (journal version):
Blei, Ng, and Jordan. “Latent Dirichlet Allocation.” JMLR, 2003.</li>
</ul>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">numpy.testing</span> <span class="k">import</span> <span class="n">assert_almost_equal</span><span class="p">,</span> <span class="n">assert_equal</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})],</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">LDA</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">vocabSize</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">describeTopics</span><span class="p">()</span>
<span class="go">[([1, 0], [0.5..., 0.49...]), ([0, 1], [0.5..., 0.49...])]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">describeTopics</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="go">[([1], [0.5...]), ([0], [0.5...])]</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">topics</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">topicsMatrix</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">topics_expect</span> <span class="o">=</span> <span class="n">array</span><span class="p">([[</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">assert_almost_equal</span><span class="p">(</span><span class="n">topics</span><span class="p">,</span> <span class="n">topics_expect</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">LDAModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">assert_equal</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">topicsMatrix</span><span class="p">(),</span> <span class="n">model</span><span class="o">.</span><span class="n">topicsMatrix</span><span class="p">())</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">vocabSize</span><span class="p">()</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">vocabSize</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.clustering.LDAModel.describeTopics">
<code class="descname">describeTopics</code><span class="sig-paren">(</span><em>maxTermsPerTopic=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#LDAModel.describeTopics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.LDAModel.describeTopics" title="Permalink to this definition"></a></dt>
<dd><p>Return the topics described by weighted terms.</p>
<p>WARNING: If vocabSize and k are large, this can return a large object!</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>maxTermsPerTopic</strong> – Maximum number of terms to collect for each topic.
(default: vocabulary size)</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Array over topics. Each topic is represented as a pair of
matching arrays: (term indices, term weights in topic).
Each topic’s terms are sorted in order of decreasing weight.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.clustering.LDAModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#LDAModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.LDAModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load the LDAModel from disk.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>sc</strong> – SparkContext.</li>
<li><strong>path</strong> – Path to where the model is stored.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.LDAModel.topicsMatrix">
<code class="descname">topicsMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#LDAModel.topicsMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.LDAModel.topicsMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Inferred topics, where each topic is represented by a distribution over terms.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.clustering.LDAModel.vocabSize">
<code class="descname">vocabSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/clustering.html#LDAModel.vocabSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.clustering.LDAModel.vocabSize" title="Permalink to this definition"></a></dt>
<dd><p>Vocabulary size (number of terms in the vocabulary)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.evaluation">
<span id="pyspark-mllib-evaluation-module"></span><h2>pyspark.mllib.evaluation module<a class="headerlink" href="#module-pyspark.mllib.evaluation" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.evaluation.BinaryClassificationMetrics">
<em class="property">class </em><code class="descclassname">pyspark.mllib.evaluation.</code><code class="descname">BinaryClassificationMetrics</code><span class="sig-paren">(</span><em>scoreAndLabels</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#BinaryClassificationMetrics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.BinaryClassificationMetrics" title="Permalink to this definition"></a></dt>
<dd><p>Evaluator for binary classification.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>scoreAndLabels</strong> – an RDD of (score, label) pairs</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.4</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.8</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)],</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span> <span class="o">=</span> <span class="n">BinaryClassificationMetrics</span><span class="p">(</span><span class="n">scoreAndLabels</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">areaUnderROC</span>
<span class="go">0.70...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">areaUnderPR</span>
<span class="go">0.83...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">unpersist</span><span class="p">()</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.BinaryClassificationMetrics.areaUnderPR">
<code class="descname">areaUnderPR</code><a class="headerlink" href="#pyspark.mllib.evaluation.BinaryClassificationMetrics.areaUnderPR" title="Permalink to this definition"></a></dt>
<dd><p>Computes the area under the precision-recall curve.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.BinaryClassificationMetrics.areaUnderROC">
<code class="descname">areaUnderROC</code><a class="headerlink" href="#pyspark.mllib.evaluation.BinaryClassificationMetrics.areaUnderROC" title="Permalink to this definition"></a></dt>
<dd><p>Computes the area under the receiver operating characteristic
(ROC) curve.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.BinaryClassificationMetrics.unpersist">
<code class="descname">unpersist</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#BinaryClassificationMetrics.unpersist"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.BinaryClassificationMetrics.unpersist" title="Permalink to this definition"></a></dt>
<dd><p>Unpersists intermediate RDDs used in the computation.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.evaluation.RegressionMetrics">
<em class="property">class </em><code class="descclassname">pyspark.mllib.evaluation.</code><code class="descname">RegressionMetrics</code><span class="sig-paren">(</span><em>predictionAndObservations</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#RegressionMetrics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.RegressionMetrics" title="Permalink to this definition"></a></dt>
<dd><p>Evaluator for regression.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>predictionAndObservations</strong> – an RDD of (prediction,
observation) pairs.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">predictionAndObservations</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">2.5</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.5</span><span class="p">),</span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">8.0</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span> <span class="o">=</span> <span class="n">RegressionMetrics</span><span class="p">(</span><span class="n">predictionAndObservations</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">explainedVariance</span>
<span class="go">8.859...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">meanAbsoluteError</span>
<span class="go">0.5...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">meanSquaredError</span>
<span class="go">0.37...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">rootMeanSquaredError</span>
<span class="go">0.61...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">r2</span>
<span class="go">0.94...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.RegressionMetrics.explainedVariance">
<code class="descname">explainedVariance</code><a class="headerlink" href="#pyspark.mllib.evaluation.RegressionMetrics.explainedVariance" title="Permalink to this definition"></a></dt>
<dd><p>Returns the explained variance regression score:
<span class="math">\(\mathrm{explainedVariance} = 1 - \mathrm{variance}(y - \hat{y}) / \mathrm{variance}(y)\)</span></p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.RegressionMetrics.meanAbsoluteError">
<code class="descname">meanAbsoluteError</code><a class="headerlink" href="#pyspark.mllib.evaluation.RegressionMetrics.meanAbsoluteError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the mean absolute error, which is a risk function corresponding to the
expected value of the absolute error loss or l1-norm loss.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.RegressionMetrics.meanSquaredError">
<code class="descname">meanSquaredError</code><a class="headerlink" href="#pyspark.mllib.evaluation.RegressionMetrics.meanSquaredError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the mean squared error, which is a risk function corresponding to the
expected value of the squared error loss or quadratic loss.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.RegressionMetrics.r2">
<code class="descname">r2</code><a class="headerlink" href="#pyspark.mllib.evaluation.RegressionMetrics.r2" title="Permalink to this definition"></a></dt>
<dd><p>Returns R<sup>2</sup>, the coefficient of determination.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError">
<code class="descname">rootMeanSquaredError</code><a class="headerlink" href="#pyspark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the root mean squared error, which is defined as the square root of
the mean squared error.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics">
<em class="property">class </em><code class="descclassname">pyspark.mllib.evaluation.</code><code class="descname">MulticlassMetrics</code><span class="sig-paren">(</span><em>predictionAndLabels</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics" title="Permalink to this definition"></a></dt>
<dd><p>Evaluator for multiclass classification.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>predictionAndLabels</strong> – an RDD of (prediction, label) pairs.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">predictionAndLabels</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span> <span class="o">=</span> <span class="n">MulticlassMetrics</span><span class="p">(</span><span class="n">predictionAndLabels</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">confusionMatrix</span><span class="p">()</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span>
<span class="go">array([[ 2., 1., 1.],</span>
<span class="go"> [ 1., 3., 0.],</span>
<span class="go"> [ 0., 0., 1.]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">falsePositiveRate</span><span class="p">(</span><span class="mf">0.0</span><span class="p">)</span>
<span class="go">0.2...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">precision</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)</span>
<span class="go">0.75...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">recall</span><span class="p">(</span><span class="mf">2.0</span><span class="p">)</span>
<span class="go">1.0...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">fMeasure</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="go">0.52...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">accuracy</span>
<span class="go">0.66...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">weightedFalsePositiveRate</span>
<span class="go">0.19...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">weightedPrecision</span>
<span class="go">0.68...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">weightedRecall</span>
<span class="go">0.66...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">weightedFMeasure</span><span class="p">()</span>
<span class="go">0.66...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">weightedFMeasure</span><span class="p">(</span><span class="mf">2.0</span><span class="p">)</span>
<span class="go">0.65...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.accuracy">
<code class="descname">accuracy</code><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.accuracy" title="Permalink to this definition"></a></dt>
<dd><p>Returns accuracy (equal to the number of correctly classified instances
divided by the total number of instances).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.confusionMatrix">
<code class="descname">confusionMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics.confusionMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.confusionMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Returns confusion matrix: predicted classes are in columns,
they are ordered by class label ascending, as in “labels”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.fMeasure">
<code class="descname">fMeasure</code><span class="sig-paren">(</span><em>label=None</em>, <em>beta=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics.fMeasure"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.fMeasure" title="Permalink to this definition"></a></dt>
<dd><p>Returns f-measure or f-measure for a given label (category) if specified.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.falsePositiveRate">
<code class="descname">falsePositiveRate</code><span class="sig-paren">(</span><em>label</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics.falsePositiveRate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.falsePositiveRate" title="Permalink to this definition"></a></dt>
<dd><p>Returns false positive rate for a given label (category).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.precision">
<code class="descname">precision</code><span class="sig-paren">(</span><em>label=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics.precision"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.precision" title="Permalink to this definition"></a></dt>
<dd><p>Returns precision or precision for a given label (category) if specified.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.recall">
<code class="descname">recall</code><span class="sig-paren">(</span><em>label=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics.recall"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.recall" title="Permalink to this definition"></a></dt>
<dd><p>Returns recall or recall for a given label (category) if specified.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.truePositiveRate">
<code class="descname">truePositiveRate</code><span class="sig-paren">(</span><em>label</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics.truePositiveRate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.truePositiveRate" title="Permalink to this definition"></a></dt>
<dd><p>Returns true positive rate for a given label (category).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.weightedFMeasure">
<code class="descname">weightedFMeasure</code><span class="sig-paren">(</span><em>beta=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#MulticlassMetrics.weightedFMeasure"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.weightedFMeasure" title="Permalink to this definition"></a></dt>
<dd><p>Returns weighted averaged f-measure.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.weightedFalsePositiveRate">
<code class="descname">weightedFalsePositiveRate</code><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.weightedFalsePositiveRate" title="Permalink to this definition"></a></dt>
<dd><p>Returns weighted false positive rate.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.weightedPrecision">
<code class="descname">weightedPrecision</code><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.weightedPrecision" title="Permalink to this definition"></a></dt>
<dd><p>Returns weighted averaged precision.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.weightedRecall">
<code class="descname">weightedRecall</code><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.weightedRecall" title="Permalink to this definition"></a></dt>
<dd><p>Returns weighted averaged recall
(equal to precision, recall and f-measure).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.MulticlassMetrics.weightedTruePositiveRate">
<code class="descname">weightedTruePositiveRate</code><a class="headerlink" href="#pyspark.mllib.evaluation.MulticlassMetrics.weightedTruePositiveRate" title="Permalink to this definition"></a></dt>
<dd><p>Returns weighted true positive rate
(equal to precision, recall and f-measure).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.evaluation.RankingMetrics">
<em class="property">class </em><code class="descclassname">pyspark.mllib.evaluation.</code><code class="descname">RankingMetrics</code><span class="sig-paren">(</span><em>predictionAndLabels</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#RankingMetrics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.RankingMetrics" title="Permalink to this definition"></a></dt>
<dd><p>Evaluator for ranking algorithms.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>predictionAndLabels</strong> – an RDD of (predicted ranking,
ground truth set) pairs.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">predictionAndLabels</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">]),</span>
<span class="gp">... </span> <span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span> <span class="o">=</span> <span class="n">RankingMetrics</span><span class="p">(</span><span class="n">predictionAndLabels</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">precisionAt</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="go">0.33...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">precisionAt</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
<span class="go">0.26...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">precisionAt</span><span class="p">(</span><span class="mi">15</span><span class="p">)</span>
<span class="go">0.17...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">meanAveragePrecision</span>
<span class="go">0.35...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">ndcgAt</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="go">0.33...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">metrics</span><span class="o">.</span><span class="n">ndcgAt</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
<span class="go">0.48...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision">
<code class="descname">meanAveragePrecision</code><a class="headerlink" href="#pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision" title="Permalink to this definition"></a></dt>
<dd><p>Returns the mean average precision (MAP) of all the queries.
If a query has an empty ground truth set, the average precision will be zero and
a log warning is generated.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.RankingMetrics.ndcgAt">
<code class="descname">ndcgAt</code><span class="sig-paren">(</span><em>k</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#RankingMetrics.ndcgAt"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.RankingMetrics.ndcgAt" title="Permalink to this definition"></a></dt>
<dd><p>Compute the average NDCG value of all the queries, truncated at ranking position k.
The discounted cumulative gain at position k is computed as:
sum<sub>i=1</sub><sup>k</sup> (2<sup>{relevance of <em>i</em>th item}</sup> - 1) / log(i + 1),
and the NDCG is obtained by dividing this DCG value by the DCG value computed on the ground truth set.
In the current implementation, the relevance value is binary.
If a query has an empty ground truth set, zero will be used as NDCG together with
a log warning.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.evaluation.RankingMetrics.precisionAt">
<code class="descname">precisionAt</code><span class="sig-paren">(</span><em>k</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/evaluation.html#RankingMetrics.precisionAt"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.evaluation.RankingMetrics.precisionAt" title="Permalink to this definition"></a></dt>
<dd><p>Compute the average precision of all the queries, truncated at ranking position k.</p>
<p>If for a query, the ranking algorithm returns n (n &lt; k) results, the precision value
will be computed as #(relevant items retrieved) / k. This formula also applies when
the size of the ground truth set is less than k.</p>
<p>If a query has an empty ground truth set, zero will be used as precision together
with a log warning.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.feature">
<span id="pyspark-mllib-feature-module"></span><h2>pyspark.mllib.feature module<a class="headerlink" href="#module-pyspark.mllib.feature" title="Permalink to this headline"></a></h2>
<p>Python package for feature extraction and transformation in MLlib.</p>
<dl class="class">
<dt id="pyspark.mllib.feature.Normalizer">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">Normalizer</code><span class="sig-paren">(</span><em>p=2.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Normalizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Normalizer" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.feature.VectorTransformer</span></code></p>
<p>Normalizes samples individually to unit L<sup>p</sup> norm</p>
<p>For any 1 &lt;= <cite>p</cite> &lt; float('inf'), normalizes samples using
sum(abs(vector) <sup>p</sup>) <sup>(1/p)</sup> as norm.</p>
<p>For <cite>p</cite> = float('inf'), max(abs(vector)) will be used as norm for
normalization.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>p</strong> – Normalization in L<sup>p</sup> space, p = 2 by default.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nor</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nor</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">v</span><span class="p">)</span>
<span class="go">DenseVector([0.0, 0.3333, 0.6667])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">v</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nor</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[DenseVector([0.0, 0.3333, 0.6667])]</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">nor2</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;inf&quot;</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nor2</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">v</span><span class="p">)</span>
<span class="go">DenseVector([0.0, 0.5, 1.0])</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.Normalizer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>vector</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Normalizer.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Normalizer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Applies unit length normalization on a vector.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>vector</strong> – vector or RDD of vector to be normalized.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">normalized vector. If the norm of the input is zero, it
will return the input vector.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.StandardScalerModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">StandardScalerModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#StandardScalerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.feature.JavaVectorTransformer</span></code></p>
<p>Represents a StandardScaler model that can transform vectors.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.feature.StandardScalerModel.mean">
<code class="descname">mean</code><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel.mean" title="Permalink to this definition"></a></dt>
<dd><p>Return the column mean values.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.StandardScalerModel.setWithMean">
<code class="descname">setWithMean</code><span class="sig-paren">(</span><em>withMean</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#StandardScalerModel.setWithMean"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel.setWithMean" title="Permalink to this definition"></a></dt>
<dd><p>Setter of the boolean which decides
whether it uses mean or not</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.StandardScalerModel.setWithStd">
<code class="descname">setWithStd</code><span class="sig-paren">(</span><em>withStd</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#StandardScalerModel.setWithStd"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel.setWithStd" title="Permalink to this definition"></a></dt>
<dd><p>Setter of the boolean which decides
whether it uses std or not</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.feature.StandardScalerModel.std">
<code class="descname">std</code><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel.std" title="Permalink to this definition"></a></dt>
<dd><p>Return the column standard deviation values.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.StandardScalerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>vector</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#StandardScalerModel.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Applies standardization transformation on a vector.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">In Python, transform cannot currently be used within
an RDD transformation or action.
Call transform directly on the RDD instead.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>vector</strong> – Vector or RDD of Vector to be standardized.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Standardized vector. If the variance of a column is
zero, it will return default <cite>0.0</cite> for the column with
zero variance.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.feature.StandardScalerModel.withMean">
<code class="descname">withMean</code><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel.withMean" title="Permalink to this definition"></a></dt>
<dd><p>Returns whether the model centers the data before scaling.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.feature.StandardScalerModel.withStd">
<code class="descname">withStd</code><a class="headerlink" href="#pyspark.mllib.feature.StandardScalerModel.withStd" title="Permalink to this definition"></a></dt>
<dd><p>Returns whether the model scales the data to unit standard deviation.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.StandardScaler">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">StandardScaler</code><span class="sig-paren">(</span><em>withMean=False</em>, <em>withStd=True</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#StandardScaler"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.StandardScaler" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Standardizes features by removing the mean and scaling to unit
variance using column summary statistics on the samples in the
training set.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>withMean</strong> – False by default. Centers the data with mean
before scaling. It will build a dense output, so take
care when applying to sparse input.</li>
<li><strong>withStd</strong> – True by default. Scales the data to unit
standard deviation.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">vs</span> <span class="o">=</span> <span class="p">[</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.3</span><span class="p">,</span> <span class="mi">0</span><span class="p">]),</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.8</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.9</span><span class="p">])]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">vs</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">standardizer</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">standardizer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span> <span class="n">r</span>
<span class="go">DenseVector([-0.7071, 0.7071, -0.7071])</span>
<span class="go">DenseVector([0.7071, -0.7071, 0.7071])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">int</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">std</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">int</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">mean</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">*</span><span class="mi">10</span><span class="p">)</span>
<span class="go">9</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">withStd</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">withMean</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.StandardScaler.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#StandardScaler.fit"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.StandardScaler.fit" title="Permalink to this definition"></a></dt>
<dd><p>Computes the mean and variance and stores as a model to be used
for later scaling.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>dataset</strong> – The data used to compute the mean and variance
to build the transformation model.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a StandardScalerModel</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.HashingTF">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">HashingTF</code><span class="sig-paren">(</span><em>numFeatures=1048576</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#HashingTF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.HashingTF" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Maps a sequence of terms to their term frequencies using the hashing
trick.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">The terms must be hashable (can not be dict/set/list…).</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>numFeatures</strong> – number of features (default: 2^20)</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">htf</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">doc</span> <span class="o">=</span> <span class="s2">&quot;a a b b c d&quot;</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot; &quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">htf</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">doc</span><span class="p">)</span>
<span class="go">SparseVector(100, {...})</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.HashingTF.indexOf">
<code class="descname">indexOf</code><span class="sig-paren">(</span><em>term</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#HashingTF.indexOf"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.HashingTF.indexOf" title="Permalink to this definition"></a></dt>
<dd><p>Returns the index of the input term.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.HashingTF.setBinary">
<code class="descname">setBinary</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#HashingTF.setBinary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.HashingTF.setBinary" title="Permalink to this definition"></a></dt>
<dd><p>If True, term frequency vector will be binary such that non-zero
term counts will be set to 1
(default: False)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.HashingTF.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>document</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#HashingTF.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.HashingTF.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input document (list of terms) to term frequency
vectors, or transform the RDD of document to RDD of term
frequency vectors.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.IDFModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">IDFModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#IDFModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.IDFModel" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.feature.JavaVectorTransformer</span></code></p>
<p>Represents an IDF model that can transform term frequency vectors.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.IDFModel.idf">
<code class="descname">idf</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#IDFModel.idf"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.IDFModel.idf" title="Permalink to this definition"></a></dt>
<dd><p>Returns the current IDF vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.IDFModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#IDFModel.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.IDFModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms term frequency (TF) vectors to TF-IDF vectors.</p>
<p>If <cite>minDocFreq</cite> was set for the IDF calculation,
the terms which occur in fewer than <cite>minDocFreq</cite>
documents will have an entry of 0.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">In Python, transform cannot currently be used within
an RDD transformation or action.
Call transform directly on the RDD instead.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – an RDD of term frequency vectors or a term frequency
vector</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">an RDD of TF-IDF vectors or a TF-IDF vector</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.IDF">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">IDF</code><span class="sig-paren">(</span><em>minDocFreq=0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#IDF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.IDF" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Inverse document frequency (IDF).</p>
<p>The standard formulation is used: <cite>idf = log((m + 1) / (d(t) + 1))</cite>,
where <cite>m</cite> is the total number of documents and <cite>d(t)</cite> is the number
of documents that contain term <cite>t</cite>.</p>
<p>This implementation supports filtering out terms which do not appear
in a minimum number of documents (controlled by the variable
<cite>minDocFreq</cite>). For terms that are not in at least <cite>minDocFreq</cite>
documents, the IDF is found as 0, resulting in TF-IDFs of 0.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>minDocFreq</strong> – minimum number of documents in which a term
should appear for filtering</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">n</span> <span class="o">=</span> <span class="mi">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">freqs</span> <span class="o">=</span> <span class="p">[</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">])]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">freqs</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tfidf</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">tfidf</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span> <span class="n">r</span>
<span class="go">SparseVector(4, {1: 0.0, 3: 0.5754})</span>
<span class="go">DenseVector([0.0, 0.0, 1.3863, 0.863])</span>
<span class="go">SparseVector(4, {1: 0.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]))</span>
<span class="go">DenseVector([0.0, 0.0, 1.3863, 0.863])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">])</span>
<span class="go">DenseVector([0.0, 0.0, 1.3863, 0.863])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)))</span>
<span class="go">SparseVector(4, {1: 0.0, 3: 0.5754})</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.IDF.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#IDF.fit"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.IDF.fit" title="Permalink to this definition"></a></dt>
<dd><p>Computes the inverse document frequency.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>dataset</strong> – an RDD of term frequency vectors</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.Word2Vec">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">Word2Vec</code><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Word2Vec creates vector representation of words in a text corpus.
The algorithm first constructs a vocabulary from the corpus
and then learns vector representation of words in the vocabulary.
The vector representation can be used as features in
natural language processing and machine learning algorithms.</p>
<p>We use the skip-gram model in our implementation and the hierarchical
softmax method to train the model. The variable names in the
implementation match the original C implementation.</p>
<p>For the original C implementation,
see <a class="reference external" href="https://code.google.com/p/word2vec/">https://code.google.com/p/word2vec/</a>.
For research papers, see
“Efficient Estimation of Word Representations in Vector Space”
and “Distributed Representations of Words and Phrases and their
Compositionality”.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">sentence</span> <span class="o">=</span> <span class="s2">&quot;a b &quot;</span> <span class="o">*</span> <span class="mi">100</span> <span class="o">+</span> <span class="s2">&quot;a c &quot;</span> <span class="o">*</span> <span class="mi">10</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">localDoc</span> <span class="o">=</span> <span class="p">[</span><span class="n">sentence</span><span class="p">,</span> <span class="n">sentence</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">doc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">localDoc</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">line</span><span class="p">:</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot; &quot;</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">()</span><span class="o">.</span><span class="n">setVectorSize</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">setSeed</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">doc</span><span class="p">)</span>
</pre></div>
</div>
<p>Querying for synonyms of a word will not return that word:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">syms</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">findSynonyms</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">[</span><span class="n">s</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">syms</span><span class="p">]</span>
<span class="go">[&#39;b&#39;, &#39;c&#39;]</span>
</pre></div>
</div>
<p>But querying for synonyms of a vector may return the word whose
representation is that vector:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">vec</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">syms</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">findSynonyms</span><span class="p">(</span><span class="n">vec</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">[</span><span class="n">s</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">syms</span><span class="p">]</span>
<span class="go">[&#39;a&#39;, &#39;b&#39;]</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">Word2VecModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">)</span> <span class="o">==</span> <span class="n">sameModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">syms</span> <span class="o">=</span> <span class="n">sameModel</span><span class="o">.</span><span class="n">findSynonyms</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">[</span><span class="n">s</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">syms</span><span class="p">]</span>
<span class="go">[&#39;b&#39;, &#39;c&#39;]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>data</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.fit"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.fit" title="Permalink to this definition"></a></dt>
<dd><p>Computes the vector representation of each word in vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>data</strong> – training data, an RDD of lists of strings</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Word2VecModel instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.setLearningRate">
<code class="descname">setLearningRate</code><span class="sig-paren">(</span><em>learningRate</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.setLearningRate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.setLearningRate" title="Permalink to this definition"></a></dt>
<dd><p>Sets initial learning rate (default: 0.025).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.setMinCount">
<code class="descname">setMinCount</code><span class="sig-paren">(</span><em>minCount</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.setMinCount"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.setMinCount" title="Permalink to this definition"></a></dt>
<dd><p>Sets minCount, the minimum number of times a token must appear
to be included in the word2vec model’s vocabulary (default: 5).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.setNumIterations">
<code class="descname">setNumIterations</code><span class="sig-paren">(</span><em>numIterations</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.setNumIterations"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.setNumIterations" title="Permalink to this definition"></a></dt>
<dd><p>Sets number of iterations (default: 1), which should be smaller
than or equal to number of partitions.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.setNumPartitions">
<code class="descname">setNumPartitions</code><span class="sig-paren">(</span><em>numPartitions</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.setNumPartitions"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.setNumPartitions" title="Permalink to this definition"></a></dt>
<dd><p>Sets number of partitions (default: 1). Use a small number for
accuracy.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>seed</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.setSeed"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets random seed.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.setVectorSize">
<code class="descname">setVectorSize</code><span class="sig-paren">(</span><em>vectorSize</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.setVectorSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.setVectorSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets vector size (default: 100).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2Vec.setWindowSize">
<code class="descname">setWindowSize</code><span class="sig-paren">(</span><em>windowSize</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2Vec.setWindowSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2Vec.setWindowSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets window size (default: 5).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.Word2VecModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">Word2VecModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2VecModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2VecModel" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.feature.JavaVectorTransformer</span></code>, <a class="reference internal" href="#pyspark.mllib.util.JavaSaveable" title="pyspark.mllib.util.JavaSaveable"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.util.JavaSaveable</span></code></a>, <a class="reference internal" href="#pyspark.mllib.util.JavaLoader" title="pyspark.mllib.util.JavaLoader"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.util.JavaLoader</span></code></a></p>
<p>Class for the Word2Vec model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2VecModel.findSynonyms">
<code class="descname">findSynonyms</code><span class="sig-paren">(</span><em>word</em>, <em>num</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2VecModel.findSynonyms"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2VecModel.findSynonyms" title="Permalink to this definition"></a></dt>
<dd><p>Find synonyms of a word or of a word vector.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>word</strong> – a word or a vector representation of word</li>
<li><strong>num</strong> – number of synonyms to find</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">array of (word, cosineSimilarity)</p>
</td>
</tr>
</tbody>
</table>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Local use only</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2VecModel.getVectors">
<code class="descname">getVectors</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2VecModel.getVectors"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2VecModel.getVectors" title="Permalink to this definition"></a></dt>
<dd><p>Returns a map of words to their vector representations.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.feature.Word2VecModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2VecModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2VecModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.Word2VecModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>word</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#Word2VecModel.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.Word2VecModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms a word to its vector representation</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Local use only</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>word</strong> – a word</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">vector representation of word(s)</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.ChiSqSelector">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">ChiSqSelector</code><span class="sig-paren">(</span><em>numTopFeatures=50</em>, <em>selectorType='numTopFeatures'</em>, <em>percentile=0.1</em>, <em>fpr=0.05</em>, <em>fdr=0.05</em>, <em>fwe=0.05</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Creates a ChiSquared feature selector.
The selector supports different selection methods: <cite>numTopFeatures</cite>, <cite>percentile</cite>, <cite>fpr</cite>,
<cite>fdr</cite>, <cite>fwe</cite>.</p>
<blockquote>
<div><ul class="simple">
<li><cite>numTopFeatures</cite> chooses a fixed number of top features according to a chi-squared test.</li>
<li><cite>percentile</cite> is similar but chooses a fraction of all features
instead of a fixed number.</li>
<li><cite>fpr</cite> chooses all features whose p-values are below a threshold,
thus controlling the false positive rate of selection.</li>
<li><cite>fdr</cite> uses the <a class="reference external" href="https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure">Benjamini-Hochberg procedure</a>
to choose all features whose false discovery rate is below a threshold.</li>
<li><cite>fwe</cite> chooses all features whose p-values are below a threshold. The threshold is scaled by
1/numFeatures, thus controlling the family-wise error rate of selection.</li>
</ul>
</div></blockquote>
<p>By default, the selection method is <cite>numTopFeatures</cite>, with the default number of top features
set to 50.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">8.0</span><span class="p">,</span> <span class="mi">1</span><span class="p">:</span> <span class="mf">7.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mi">2</span><span class="p">:</span> <span class="mf">6.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">7.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">8.0</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ChiSqSelector</span><span class="p">(</span><span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mi">2</span><span class="p">:</span> <span class="mf">6.0</span><span class="p">}))</span>
<span class="go">SparseVector(1, {})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">7.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]))</span>
<span class="go">DenseVector([7.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ChiSqSelector</span><span class="p">(</span><span class="n">selectorType</span><span class="o">=</span><span class="s2">&quot;fpr&quot;</span><span class="p">,</span> <span class="n">fpr</span><span class="o">=</span><span class="mf">0.2</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mi">2</span><span class="p">:</span> <span class="mf">6.0</span><span class="p">}))</span>
<span class="go">SparseVector(1, {})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">7.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]))</span>
<span class="go">DenseVector([7.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ChiSqSelector</span><span class="p">(</span><span class="n">selectorType</span><span class="o">=</span><span class="s2">&quot;percentile&quot;</span><span class="p">,</span> <span class="n">percentile</span><span class="o">=</span><span class="mf">0.34</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">7.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]))</span>
<span class="go">DenseVector([7.0])</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelector.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>data</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector.fit"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector.fit" title="Permalink to this definition"></a></dt>
<dd><p>Returns a ChiSquared feature selector.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>data</strong> – an <cite>RDD[LabeledPoint]</cite> containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
Apply feature discretizer before using this function.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelector.setFdr">
<code class="descname">setFdr</code><span class="sig-paren">(</span><em>fdr</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector.setFdr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector.setFdr" title="Permalink to this definition"></a></dt>
<dd><p>Set FDR, in the range [0.0, 1.0], for feature selection by FDR.
Only applicable when selectorType = “fdr”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelector.setFpr">
<code class="descname">setFpr</code><span class="sig-paren">(</span><em>fpr</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector.setFpr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector.setFpr" title="Permalink to this definition"></a></dt>
<dd><p>Set FPR, in the range [0.0, 1.0], for feature selection by FPR.
Only applicable when selectorType = “fpr”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelector.setFwe">
<code class="descname">setFwe</code><span class="sig-paren">(</span><em>fwe</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector.setFwe"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector.setFwe" title="Permalink to this definition"></a></dt>
<dd><p>Set FWE, in the range [0.0, 1.0], for feature selection by FWE.
Only applicable when selectorType = “fwe”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelector.setNumTopFeatures">
<code class="descname">setNumTopFeatures</code><span class="sig-paren">(</span><em>numTopFeatures</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector.setNumTopFeatures"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector.setNumTopFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Set numTopFeatures for feature selection by number of top features.
Only applicable when selectorType = “numTopFeatures”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelector.setPercentile">
<code class="descname">setPercentile</code><span class="sig-paren">(</span><em>percentile</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector.setPercentile"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector.setPercentile" title="Permalink to this definition"></a></dt>
<dd><p>Set percentile, in the range [0.0, 1.0], for feature selection by percentile.
Only applicable when selectorType = “percentile”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelector.setSelectorType">
<code class="descname">setSelectorType</code><span class="sig-paren">(</span><em>selectorType</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelector.setSelectorType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelector.setSelectorType" title="Permalink to this definition"></a></dt>
<dd><p>Set the selector type of the ChiSqSelector.
Supported options: “numTopFeatures” (default), “percentile”, “fpr”, “fdr”, “fwe”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.ChiSqSelectorModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">ChiSqSelectorModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelectorModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelectorModel" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.feature.JavaVectorTransformer</span></code></p>
<p>Represents a Chi Squared selector model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.ChiSqSelectorModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>vector</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ChiSqSelectorModel.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ChiSqSelectorModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Applies transformation on a vector.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>vector</strong> – Vector or RDD of Vector to be transformed.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">transformed vector.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.feature.ElementwiseProduct">
<em class="property">class </em><code class="descclassname">pyspark.mllib.feature.</code><code class="descname">ElementwiseProduct</code><span class="sig-paren">(</span><em>scalingVector</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ElementwiseProduct"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ElementwiseProduct" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.feature.VectorTransformer</span></code></p>
<p>Scales each column of the vector with the supplied weight vector,
i.e. the elementwise product.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">weight</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">eprod</span> <span class="o">=</span> <span class="n">ElementwiseProduct</span><span class="p">(</span><span class="n">weight</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">eprod</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
<span class="go">DenseVector([2.0, 2.0, 9.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">9.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">eprod</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.feature.ElementwiseProduct.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>vector</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/feature.html#ElementwiseProduct.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.feature.ElementwiseProduct.transform" title="Permalink to this definition"></a></dt>
<dd><p>Computes the Hadamard product of the vector with the scaling vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.fpm">
<span id="pyspark-mllib-fpm-module"></span><h2>pyspark.mllib.fpm module<a class="headerlink" href="#module-pyspark.mllib.fpm" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.fpm.FPGrowth">
<em class="property">class </em><code class="descclassname">pyspark.mllib.fpm.</code><code class="descname">FPGrowth</code><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#FPGrowth"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.FPGrowth" title="Permalink to this definition"></a></dt>
<dd><p>A Parallel FP-growth algorithm to mine frequent itemsets.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="class">
<dt id="pyspark.mllib.fpm.FPGrowth.FreqItemset">
<em class="property">class </em><code class="descname">FreqItemset</code><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#FPGrowth.FreqItemset"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.FPGrowth.FreqItemset" title="Permalink to this definition"></a></dt>
<dd><p>Represents an (items, freq) tuple.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.fpm.FPGrowth.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>minSupport=0.3</em>, <em>numPartitions=-1</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#FPGrowth.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.FPGrowth.train" title="Permalink to this definition"></a></dt>
<dd><p>Computes an FP-Growth model that contains frequent itemsets.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The input data set, each element contains a transaction.</li>
<li><strong>minSupport</strong> – The minimal support level.
(default: 0.3)</li>
<li><strong>numPartitions</strong> – The number of partitions used by parallel FP-growth. A value
of -1 will use the same number as input data.
(default: -1)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.fpm.FPGrowthModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.fpm.</code><code class="descname">FPGrowthModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#FPGrowthModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.FPGrowthModel" title="Permalink to this definition"></a></dt>
<dd><p>An FP-Growth model for mining frequent itemsets
using the Parallel FP-Growth algorithm.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;d&quot;</span><span class="p">,</span> <span class="s2">&quot;e&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;e&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;f&quot;</span><span class="p">]]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">FPGrowth</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="mf">0.6</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">freqItemsets</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="go">[FreqItemset(items=[&#39;a&#39;], freq=4), FreqItemset(items=[&#39;c&#39;], freq=3), ...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/fpm&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">FPGrowthModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">freqItemsets</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span> <span class="o">==</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">freqItemsets</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.fpm.FPGrowthModel.freqItemsets">
<code class="descname">freqItemsets</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#FPGrowthModel.freqItemsets"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.FPGrowthModel.freqItemsets" title="Permalink to this definition"></a></dt>
<dd><p>Returns the frequent itemsets of this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.fpm.FPGrowthModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#FPGrowthModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.FPGrowthModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.fpm.PrefixSpan">
<em class="property">class </em><code class="descclassname">pyspark.mllib.fpm.</code><code class="descname">PrefixSpan</code><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#PrefixSpan"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.PrefixSpan" title="Permalink to this definition"></a></dt>
<dd><p>A parallel PrefixSpan algorithm to mine frequent sequential patterns.
The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan:
Mining Sequential Patterns Efficiently by Prefix-Projected Pattern Growth
(<a class="reference external" href="http://doi.org/10.1109/ICDE.2001.914830">http://doi.org/10.1109/ICDE.2001.914830</a>).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="class">
<dt id="pyspark.mllib.fpm.PrefixSpan.FreqSequence">
<em class="property">class </em><code class="descname">FreqSequence</code><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#PrefixSpan.FreqSequence"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.PrefixSpan.FreqSequence" title="Permalink to this definition"></a></dt>
<dd><p>Represents a (sequence, freq) tuple.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.fpm.PrefixSpan.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>minSupport=0.1</em>, <em>maxPatternLength=10</em>, <em>maxLocalProjDBSize=32000000</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#PrefixSpan.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.PrefixSpan.train" title="Permalink to this definition"></a></dt>
<dd><p>Finds the complete set of frequent sequential patterns in the
input sequences of itemsets.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The input data set, each element contains a sequence of
itemsets.</li>
<li><strong>minSupport</strong> – The minimal support level of the sequential pattern, any
pattern that appears more than (minSupport *
size-of-the-dataset) times will be output.
(default: 0.1)</li>
<li><strong>maxPatternLength</strong> – The maximal length of the sequential pattern; any pattern
whose length does not exceed maxPatternLength will be output.
(default: 10)</li>
<li><strong>maxLocalProjDBSize</strong> – The maximum number of items (including delimiters used in the
internal storage format) allowed in a projected database before
local processing. If a projected database exceeds this size,
another iteration of distributed prefix growth is run.
(default: 32000000)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.fpm.PrefixSpanModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.fpm.</code><code class="descname">PrefixSpanModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#PrefixSpanModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.PrefixSpanModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by PrefixSpan.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="p">[[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;c&quot;</span><span class="p">]],</span>
<span class="gp">... </span> <span class="p">[[</span><span class="s2">&quot;a&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">]],</span>
<span class="gp">... </span> <span class="p">[[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;e&quot;</span><span class="p">]],</span>
<span class="gp">... </span> <span class="p">[[</span><span class="s2">&quot;f&quot;</span><span class="p">]]]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">PrefixSpan</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">freqSequences</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="go">[FreqSequence(sequence=[[&#39;a&#39;]], freq=3), FreqSequence(sequence=[[&#39;a&#39;], [&#39;a&#39;]], freq=1), ...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.fpm.PrefixSpanModel.freqSequences">
<code class="descname">freqSequences</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/fpm.html#PrefixSpanModel.freqSequences"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.fpm.PrefixSpanModel.freqSequences" title="Permalink to this definition"></a></dt>
<dd><p>Gets the frequent sequences.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.linalg">
<span id="pyspark-mllib-linalg-module"></span><h2>pyspark.mllib.linalg module<a class="headerlink" href="#module-pyspark.mllib.linalg" title="Permalink to this headline"></a></h2>
<p>MLlib utilities for linear algebra. For dense vectors, MLlib
uses the NumPy <code class="xref py py-class docutils literal"><span class="pre">array</span></code> type, so you can simply pass NumPy arrays
around. For sparse vectors, users can construct a <a class="reference internal" href="#pyspark.mllib.linalg.SparseVector" title="pyspark.mllib.linalg.SparseVector"><code class="xref py py-class docutils literal"><span class="pre">SparseVector</span></code></a>
object from MLlib or pass SciPy <code class="xref py py-class docutils literal"><span class="pre">scipy.sparse</span></code> column vectors if
SciPy is available in their environment.</p>
<dl class="class">
<dt id="pyspark.mllib.linalg.Vector">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">Vector</code><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vector" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<dl class="method">
<dt id="pyspark.mllib.linalg.Vector.asML">
<code class="descname">asML</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vector.asML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vector.asML" title="Permalink to this definition"></a></dt>
<dd><p>Convert this vector to the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.Vector" title="pyspark.ml.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.Vector</span></code></a></td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.Vector.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vector.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vector.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Convert the vector into a numpy.ndarray.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">numpy.ndarray</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.DenseVector">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">DenseVector</code><span class="sig-paren">(</span><em>ar</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.Vector" title="pyspark.mllib.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Vector</span></code></a></p>
<p>A dense vector represented by a value array. We use a NumPy array for
storage, and arithmetic is delegated to the underlying NumPy
array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">u</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">+</span> <span class="n">u</span>
<span class="go">DenseVector([4.0, 6.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="mi">2</span> <span class="o">-</span> <span class="n">v</span>
<span class="go">DenseVector([1.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">/</span> <span class="mi">2</span>
<span class="go">DenseVector([0.5, 1.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">*</span> <span class="n">u</span>
<span class="go">DenseVector([3.0, 8.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">u</span> <span class="o">/</span> <span class="n">v</span>
<span class="go">DenseVector([3.0, 2.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">u</span> <span class="o">%</span> <span class="mi">2</span>
<span class="go">DenseVector([1.0, 0.0])</span>
</pre></div>
</div>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseVector.asML">
<code class="descname">asML</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector.asML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.asML" title="Permalink to this definition"></a></dt>
<dd><p>Convert this vector to the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.DenseVector" title="pyspark.ml.linalg.DenseVector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.DenseVector</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseVector.dot">
<code class="descname">dot</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector.dot"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.dot" title="Permalink to this definition"></a></dt>
<dd><p>Compute the dot product of two Vectors. We support
(NumPy array, list, SparseVector, or SciPy sparse)
and a target NumPy array that is either 1- or 2-dimensional.
Equivalent to calling numpy.dot of the two vectors.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span> <span class="o">=</span> <span class="n">DenseVector</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">dense</span><span class="p">)</span>
<span class="go">5.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">]))</span>
<span class="go">4.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">))</span>
<span class="go">5.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">)))</span>
<span class="go">5.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">([</span><span class="mf">1.</span><span class="p">,])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">reshape</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">],</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span> <span class="n">order</span><span class="o">=</span><span class="s1">&#39;F&#39;</span><span class="p">))</span>
<span class="go">array([ 5., 11.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">reshape</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">],</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">order</span><span class="o">=</span><span class="s1">&#39;F&#39;</span><span class="p">))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseVector.norm">
<code class="descname">norm</code><span class="sig-paren">(</span><em>p</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector.norm"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.norm" title="Permalink to this definition"></a></dt>
<dd><p>Calculates the norm of a DenseVector.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">DenseVector</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="o">-</span><span class="mi">3</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="go">3.7...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="go">6.0</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseVector.numNonzeros">
<code class="descname">numNonzeros</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector.numNonzeros"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.numNonzeros" title="Permalink to this definition"></a></dt>
<dd><p>Number of nonzero elements. This scans all active values and counts the nonzero ones.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.DenseVector.parse">
<em class="property">static </em><code class="descname">parse</code><span class="sig-paren">(</span><em>s</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector.parse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.parse" title="Permalink to this definition"></a></dt>
<dd><p>Parse string representation back into the DenseVector.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">DenseVector</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span><span class="s1">&#39; [ 0.0,1.0,2.0, 3.0]&#39;</span><span class="p">)</span>
<span class="go">DenseVector([0.0, 1.0, 2.0, 3.0])</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseVector.squared_distance">
<code class="descname">squared_distance</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector.squared_distance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.squared_distance" title="Permalink to this definition"></a></dt>
<dd><p>Squared distance of two Vectors.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span> <span class="o">=</span> <span class="n">DenseVector</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">dense1</span><span class="p">)</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense2</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">dense2</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense3</span> <span class="o">=</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">dense3</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sparse1</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">sparse1</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">([</span><span class="mf">1.</span><span class="p">,])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,],</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseVector.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseVector.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Returns a numpy.ndarray.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.DenseVector.values">
<code class="descname">values</code><a class="headerlink" href="#pyspark.mllib.linalg.DenseVector.values" title="Permalink to this definition"></a></dt>
<dd><p>Returns a list of values</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.SparseVector">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">SparseVector</code><span class="sig-paren">(</span><em>size</em>, <em>*args</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.Vector" title="pyspark.mllib.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Vector</span></code></a></p>
<p>A simple sparse vector class for passing data to MLlib. Users may
alternatively pass SciPy’s <code class="xref py py-class docutils literal"><span class="pre">scipy.sparse</span></code> data types.</p>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseVector.asML">
<code class="descname">asML</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector.asML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.asML" title="Permalink to this definition"></a></dt>
<dd><p>Convert this vector to the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.SparseVector" title="pyspark.ml.linalg.SparseVector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.SparseVector</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseVector.dot">
<code class="descname">dot</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector.dot"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.dot" title="Permalink to this definition"></a></dt>
<dd><p>Dot product with a SparseVector or 1- or 2-dimensional NumPy array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
<span class="go">25.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">]))</span>
<span class="go">22.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">b</span><span class="p">)</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">4</span><span class="p">]]))</span>
<span class="go">array([ 22., 22.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.SparseVector.indices">
<code class="descname">indices</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.indices" title="Permalink to this definition"></a></dt>
<dd><p>A list of indices corresponding to active entries.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseVector.norm">
<code class="descname">norm</code><span class="sig-paren">(</span><em>p</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector.norm"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.norm" title="Permalink to this definition"></a></dt>
<dd><p>Calculates the norm of a SparseVector.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">3.</span><span class="p">,</span> <span class="o">-</span><span class="mf">4.</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="go">7.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="go">5.0</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseVector.numNonzeros">
<code class="descname">numNonzeros</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector.numNonzeros"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.numNonzeros" title="Permalink to this definition"></a></dt>
<dd><p>Number of nonzero elements. This scans all active values and counts the nonzero ones.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.SparseVector.parse">
<em class="property">static </em><code class="descname">parse</code><span class="sig-paren">(</span><em>s</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector.parse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.parse" title="Permalink to this definition"></a></dt>
<dd><p>Parse string representation back into the SparseVector.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">SparseVector</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span><span class="s1">&#39; (4, [0,1 ],[ 4.0,5.0] )&#39;</span><span class="p">)</span>
<span class="go">SparseVector(4, {0: 4.0, 1: 5.0})</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.SparseVector.size">
<code class="descname">size</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.size" title="Permalink to this definition"></a></dt>
<dd><p>Size of the vector.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseVector.squared_distance">
<code class="descname">squared_distance</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector.squared_distance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.squared_distance" title="Permalink to this definition"></a></dt>
<dd><p>Squared distance from a SparseVector or 1-dimensional NumPy array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">]))</span>
<span class="go">11.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">]))</span>
<span class="go">11.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">b</span><span class="p">)</span>
<span class="go">26.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
<span class="go">26.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseVector.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseVector.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Returns a copy of this SparseVector as a 1-dimensional NumPy array.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.SparseVector.values">
<code class="descname">values</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.mllib.linalg.SparseVector.values" title="Permalink to this definition"></a></dt>
<dd><p>A list of values corresponding to active entries.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.Vectors">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">Vectors</code><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Factory methods for working with vectors.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Dense vectors are simply represented as NumPy array objects,
so there is no need to convert them for use in MLlib. For sparse vectors,
the factory methods in this class create an MLlib-compatible type, or users
can pass in SciPy’s <code class="xref py py-class docutils literal"><span class="pre">scipy.sparse</span></code> column vectors.</p>
</div>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.dense">
<em class="property">static </em><code class="descname">dense</code><span class="sig-paren">(</span><em>*elements</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.dense"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.dense" title="Permalink to this definition"></a></dt>
<dd><p>Create a dense vector of 64-bit floats from a Python list or numbers.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">])</span>
<span class="go">DenseVector([1.0, 2.0, 3.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="go">DenseVector([1.0, 2.0])</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.fromML">
<em class="property">static </em><code class="descname">fromML</code><span class="sig-paren">(</span><em>vec</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.fromML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.fromML" title="Permalink to this definition"></a></dt>
<dd><p>Convert a vector from the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>vec</strong> – a <a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.Vector" title="pyspark.ml.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.Vector</span></code></a></td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a <a class="reference internal" href="#pyspark.mllib.linalg.Vector" title="pyspark.mllib.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Vector</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.norm">
<em class="property">static </em><code class="descname">norm</code><span class="sig-paren">(</span><em>vector</em>, <em>p</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.norm"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.norm" title="Permalink to this definition"></a></dt>
<dd><p>Find norm of the given vector.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.parse">
<em class="property">static </em><code class="descname">parse</code><span class="sig-paren">(</span><em>s</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.parse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.parse" title="Permalink to this definition"></a></dt>
<dd><p>Parse a string representation back into the Vector.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span><span class="s1">&#39;[2,1,2 ]&#39;</span><span class="p">)</span>
<span class="go">DenseVector([2.0, 1.0, 2.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span><span class="s1">&#39; ( 100, [0], [2])&#39;</span><span class="p">)</span>
<span class="go">SparseVector(100, {0: 2.0})</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.sparse">
<em class="property">static </em><code class="descname">sparse</code><span class="sig-paren">(</span><em>size</em>, <em>*args</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.sparse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.sparse" title="Permalink to this definition"></a></dt>
<dd><p>Create a sparse vector, using either a dictionary, a list of
(index, value) pairs, or two separate arrays of indices and
values (sorted by index).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>size</strong> – Size of the vector.</li>
<li><strong>args</strong> – Non-zero entries, as a dictionary, list of tuples,
or two sorted lists containing indices and values.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mi">3</span><span class="p">:</span> <span class="mf">5.5</span><span class="p">})</span>
<span class="go">SparseVector(4, {1: 1.0, 3: 5.5})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">5.5</span><span class="p">)])</span>
<span class="go">SparseVector(4, {1: 1.0, 3: 5.5})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">5.5</span><span class="p">])</span>
<span class="go">SparseVector(4, {1: 1.0, 3: 5.5})</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.squared_distance">
<em class="property">static </em><code class="descname">squared_distance</code><span class="sig-paren">(</span><em>v1</em>, <em>v2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.squared_distance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.squared_distance" title="Permalink to this definition"></a></dt>
<dd><p>Squared distance between two vectors.
v1 and v2 can be of type SparseVector, DenseVector, np.ndarray
or array.array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">b</span><span class="p">)</span>
<span class="go">51.0</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.stringify">
<em class="property">static </em><code class="descname">stringify</code><span class="sig-paren">(</span><em>vector</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.stringify"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.stringify" title="Permalink to this definition"></a></dt>
<dd><p>Converts a vector into a string, which can be recognized by
Vectors.parse().</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">stringify</span><span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="go">&#39;(2,[1],[1.0])&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">stringify</span><span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]))</span>
<span class="go">&#39;[0.0,1.0]&#39;</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Vectors.zeros">
<em class="property">static </em><code class="descname">zeros</code><span class="sig-paren">(</span><em>size</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Vectors.zeros"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Vectors.zeros" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.Matrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">Matrix</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>isTransposed=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Matrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Matrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<dl class="method">
<dt id="pyspark.mllib.linalg.Matrix.asML">
<code class="descname">asML</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Matrix.asML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Matrix.asML" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.Matrix.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Matrix.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Matrix.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Returns its elements in a NumPy ndarray.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.DenseMatrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">DenseMatrix</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>values</em>, <em>isTransposed=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.Matrix" title="pyspark.mllib.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Matrix</span></code></a></p>
<p>Column-major dense matrix.</p>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseMatrix.asML">
<code class="descname">asML</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseMatrix.asML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseMatrix.asML" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.DenseMatrix" title="pyspark.ml.linalg.DenseMatrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.DenseMatrix</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseMatrix.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseMatrix.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseMatrix.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Return a numpy.ndarray.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">m</span> <span class="o">=</span> <span class="n">DenseMatrix</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="nb">range</span><span class="p">(</span><span class="mi">4</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">m</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span>
<span class="go">array([[ 0., 2.],</span>
<span class="go"> [ 1., 3.]])</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.DenseMatrix.toSparse">
<code class="descname">toSparse</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#DenseMatrix.toSparse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.DenseMatrix.toSparse" title="Permalink to this definition"></a></dt>
<dd><p>Convert to SparseMatrix</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.SparseMatrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">SparseMatrix</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>colPtrs</em>, <em>rowIndices</em>, <em>values</em>, <em>isTransposed=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.Matrix" title="pyspark.mllib.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Matrix</span></code></a></p>
<p>Sparse Matrix stored in CSC format.</p>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseMatrix.asML">
<code class="descname">asML</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseMatrix.asML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseMatrix.asML" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.SparseMatrix" title="pyspark.ml.linalg.SparseMatrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.SparseMatrix</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseMatrix.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseMatrix.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseMatrix.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Return a numpy.ndarray.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.SparseMatrix.toDense">
<code class="descname">toDense</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#SparseMatrix.toDense"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.SparseMatrix.toDense" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.Matrices">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">Matrices</code><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Matrices"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Matrices" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Matrices.dense">
<em class="property">static </em><code class="descname">dense</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>values</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Matrices.dense"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Matrices.dense" title="Permalink to this definition"></a></dt>
<dd><p>Create a DenseMatrix</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Matrices.fromML">
<em class="property">static </em><code class="descname">fromML</code><span class="sig-paren">(</span><em>mat</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Matrices.fromML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Matrices.fromML" title="Permalink to this definition"></a></dt>
<dd><p>Convert a matrix from the new mllib-local representation.
This does NOT copy the data; it copies references.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>mat</strong> – a <a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.Matrix" title="pyspark.ml.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.Matrix</span></code></a></td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a <a class="reference internal" href="#pyspark.mllib.linalg.Matrix" title="pyspark.mllib.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Matrix</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.linalg.Matrices.sparse">
<em class="property">static </em><code class="descname">sparse</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>colPtrs</em>, <em>rowIndices</em>, <em>values</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#Matrices.sparse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.Matrices.sparse" title="Permalink to this definition"></a></dt>
<dd><p>Create a SparseMatrix</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.QRDecomposition">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.</code><code class="descname">QRDecomposition</code><span class="sig-paren">(</span><em>Q</em>, <em>R</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg.html#QRDecomposition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.QRDecomposition" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Represents QR factors.</p>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.QRDecomposition.Q">
<code class="descname">Q</code><a class="headerlink" href="#pyspark.mllib.linalg.QRDecomposition.Q" title="Permalink to this definition"></a></dt>
<dd><p>An orthogonal matrix Q in a QR decomposition.
May be None if not computed.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.QRDecomposition.R">
<code class="descname">R</code><a class="headerlink" href="#pyspark.mllib.linalg.QRDecomposition.R" title="Permalink to this definition"></a></dt>
<dd><p>An upper triangular matrix R in a QR decomposition.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.linalg.distributed">
<span id="pyspark-mllib-linalg-distributed-module"></span><h2>pyspark.mllib.linalg.distributed module<a class="headerlink" href="#module-pyspark.mllib.linalg.distributed" title="Permalink to this headline"></a></h2>
<p>Package for distributed linear algebra.</p>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">BlockMatrix</code><span class="sig-paren">(</span><em>blocks</em>, <em>rowsPerBlock</em>, <em>colsPerBlock</em>, <em>numRows=0</em>, <em>numCols=0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.distributed.DistributedMatrix" title="pyspark.mllib.linalg.distributed.DistributedMatrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.distributed.DistributedMatrix</span></code></a></p>
<p>Represents a distributed matrix in blocks of local matrices.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>blocks</strong> – An RDD of sub-matrix blocks
((blockRowIndex, blockColIndex), sub-matrix) that
form this distributed matrix. If multiple blocks
with the same index exist, the results for
operations like add and multiply will be
unpredictable.</li>
<li><strong>rowsPerBlock</strong> – Number of rows that make up each block.
The blocks forming the final rows are not
required to have the given number of rows.</li>
<li><strong>colsPerBlock</strong> – Number of columns that make up each block.
The blocks forming the final columns are not
required to have the given number of columns.</li>
<li><strong>numRows</strong> – Number of rows of this matrix. If the supplied
value is less than or equal to zero, the number
of rows will be calculated when <cite>numRows</cite> is
invoked.</li>
<li><strong>numCols</strong> – Number of columns of this matrix. If the supplied
value is less than or equal to zero, the number
of columns will be calculated when <cite>numCols</cite> is
invoked.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.add">
<code class="descname">add</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.add"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.add" title="Permalink to this definition"></a></dt>
<dd><p>Adds two block matrices together. The matrices must have the
same size and matching <cite>rowsPerBlock</cite> and <cite>colsPerBlock</cite> values.
If one of the sub matrix blocks that are being added is a
SparseMatrix, the resulting sub matrix block will also be a
SparseMatrix, even if it is being added to a DenseMatrix. If
two dense sub matrix blocks are added, the output block will
also be a DenseMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dm1</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dm2</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sm</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks1</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm1</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm2</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks2</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm1</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm2</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks3</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">sm</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm2</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat2</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat3</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks3</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">mat2</span><span class="p">)</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(6, 2, [2.0, 4.0, 6.0, 14.0, 16.0, 18.0, 8.0, 10.0, 12.0, 20.0, 22.0, 24.0], 0)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">mat3</span><span class="p">)</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(6, 2, [8.0, 2.0, 3.0, 14.0, 16.0, 18.0, 4.0, 16.0, 18.0, 20.0, 22.0, 24.0], 0)</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.blocks">
<code class="descname">blocks</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.blocks" title="Permalink to this definition"></a></dt>
<dd><p>The RDD of sub-matrix blocks
((blockRowIndex, blockColIndex), sub-matrix) that form this
distributed matrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span>
<span class="gp">... </span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))]),</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">blocks</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="go">((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0))</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.cache">
<code class="descname">cache</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.cache"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.cache" title="Permalink to this definition"></a></dt>
<dd><p>Caches the underlying RDD.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.colsPerBlock">
<code class="descname">colsPerBlock</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.colsPerBlock" title="Permalink to this definition"></a></dt>
<dd><p>Number of columns that make up each block.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">colsPerBlock</span>
<span class="go">2</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.multiply">
<code class="descname">multiply</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.multiply"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.multiply" title="Permalink to this definition"></a></dt>
<dd><p>Left multiplies this BlockMatrix by <cite>other</cite>, another
BlockMatrix. The <cite>colsPerBlock</cite> of this matrix must equal the
<cite>rowsPerBlock</cite> of <cite>other</cite>. If <cite>other</cite> contains any SparseMatrix
blocks, they will have to be converted to DenseMatrix blocks.
The output BlockMatrix will only consist of DenseMatrix blocks.
This may cause some performance issues until support for
multiplying two sparse matrices is added.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dm1</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dm2</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dm3</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dm4</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sm</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks1</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm1</span><span class="p">),</span> <span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">dm2</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks2</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm3</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm4</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks3</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">sm</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm4</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat2</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat3</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks3</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span><span class="o">.</span><span class="n">multiply</span><span class="p">(</span><span class="n">mat2</span><span class="p">)</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(2, 2, [242.0, 272.0, 350.0, 398.0], 0)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span><span class="o">.</span><span class="n">multiply</span><span class="p">(</span><span class="n">mat3</span><span class="p">)</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(2, 2, [227.0, 258.0, 394.0, 450.0], 0)</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.numColBlocks">
<code class="descname">numColBlocks</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.numColBlocks" title="Permalink to this definition"></a></dt>
<dd><p>Number of columns of blocks in the BlockMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">numColBlocks</span>
<span class="go">1</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.numCols">
<code class="descname">numCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.numCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.numCols" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of columns.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">2</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">6</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.numRowBlocks">
<code class="descname">numRowBlocks</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.numRowBlocks" title="Permalink to this definition"></a></dt>
<dd><p>Number of rows of blocks in the BlockMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">numRowBlocks</span>
<span class="go">2</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.numRows">
<code class="descname">numRows</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.numRows"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.numRows" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of rows.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">6</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">7</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.persist">
<code class="descname">persist</code><span class="sig-paren">(</span><em>storageLevel</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.persist"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.persist" title="Permalink to this definition"></a></dt>
<dd><p>Persists the underlying RDD with the specified storage level.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.rowsPerBlock">
<code class="descname">rowsPerBlock</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.rowsPerBlock" title="Permalink to this definition"></a></dt>
<dd><p>Number of rows that make up each block.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">rowsPerBlock</span>
<span class="go">3</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.subtract">
<code class="descname">subtract</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.subtract"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.subtract" title="Permalink to this definition"></a></dt>
<dd><p>Subtracts the given block matrix <cite>other</cite> from this block matrix:
<cite>this - other</cite>. The matrices must have the same size and
matching <cite>rowsPerBlock</cite> and <cite>colsPerBlock</cite> values. If one of
the sub matrix blocks that are being subtracted is a
SparseMatrix, the resulting sub matrix block will also be a
SparseMatrix, even if it is being subtracted from a DenseMatrix.
If two dense sub matrix blocks are subtracted, the output block
will also be a DenseMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dm1</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dm2</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sm</span> <span class="o">=</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks1</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm1</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm2</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks2</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm2</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm1</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blocks3</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">sm</span><span class="p">),</span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dm2</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat2</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat3</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks3</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat1</span><span class="o">.</span><span class="n">subtract</span><span class="p">(</span><span class="n">mat2</span><span class="p">)</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(6, 2, [-4.0, -7.0, -4.0, 4.0, 7.0, 4.0, -6.0, -5.0, -10.0, 6.0, 5.0, 10.0], 0)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat2</span><span class="o">.</span><span class="n">subtract</span><span class="p">(</span><span class="n">mat3</span><span class="p">)</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(6, 2, [6.0, 8.0, 9.0, -4.0, -7.0, -4.0, 10.0, 9.0, 9.0, -6.0, -5.0, -10.0], 0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.toCoordinateMatrix">
<code class="descname">toCoordinateMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.toCoordinateMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.toCoordinateMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to a CoordinateMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">toCoordinateMatrix</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">entries</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="go">[MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 0, 7.0)]</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.toIndexedRowMatrix">
<code class="descname">toIndexedRowMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.toIndexedRowMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.toIndexedRowMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to an IndexedRowMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">toIndexedRowMatrix</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This BlockMatrix will have 6 effective rows, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># having two sub-matrix blocks stacked, each with 3 rows.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># The ensuing IndexedRowMatrix will also have 6 rows.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">6</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This BlockMatrix will have 2 effective columns, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># having two sub-matrix blocks stacked, each with 2 columns.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># The ensuing IndexedRowMatrix will also have 2 columns.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">2</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.toLocalMatrix">
<code class="descname">toLocalMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.toLocalMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.toLocalMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Collect the distributed matrix on the driver as a DenseMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This BlockMatrix will have 6 effective rows, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># having two sub-matrix blocks stacked, each with 3 rows.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># The ensuing DenseMatrix will also have 6 rows.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">)</span>
<span class="go">6</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This BlockMatrix will have 2 effective columns, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># having two sub-matrix blocks stacked, each with 2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># columns. The ensuing DenseMatrix will also have 2 columns.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">)</span>
<span class="go">2</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.transpose">
<code class="descname">transpose</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.transpose"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.transpose" title="Permalink to this definition"></a></dt>
<dd><p>Transpose this BlockMatrix. Returns a new BlockMatrix
instance sharing the same underlying data. This is a lazy operation.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">blocks</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat_transposed</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">transpose</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat_transposed</span><span class="o">.</span><span class="n">toLocalMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(2, 6, [1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 7.0, 10.0, 8.0, 11.0, 9.0, 12.0], 0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.BlockMatrix.validate">
<code class="descname">validate</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#BlockMatrix.validate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.BlockMatrix.validate" title="Permalink to this definition"></a></dt>
<dd><p>Validates the block matrix info against the matrix data (<cite>blocks</cite>)
and throws an exception if any error is found.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">CoordinateMatrix</code><span class="sig-paren">(</span><em>entries</em>, <em>numRows=0</em>, <em>numCols=0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#CoordinateMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.distributed.DistributedMatrix" title="pyspark.mllib.linalg.distributed.DistributedMatrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.distributed.DistributedMatrix</span></code></a></p>
<p>Represents a matrix in coordinate format.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>entries</strong> – An RDD of MatrixEntry inputs or
(long, long, float) tuples.</li>
<li><strong>numRows</strong> – Number of rows in the matrix. A non-positive
value means unknown, at which point the number
of rows will be determined by the max row
index plus one.</li>
<li><strong>numCols</strong> – Number of columns in the matrix. A non-positive
value means unknown, at which point the number
of columns will be determined by the max column
index plus one.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix.entries">
<code class="descname">entries</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix.entries" title="Permalink to this definition"></a></dt>
<dd><p>Entries of the CoordinateMatrix stored as an RDD of
MatrixEntries.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mf">2.1</span><span class="p">)]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">entries</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="go">MatrixEntry(0, 0, 1.2)</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix.numCols">
<code class="descname">numCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#CoordinateMatrix.numCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix.numCols" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of cols.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.7</span><span class="p">)])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">2</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">6</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix.numRows">
<code class="descname">numRows</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#CoordinateMatrix.numRows"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix.numRows" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of rows.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.7</span><span class="p">)])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">3</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">7</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix.toBlockMatrix">
<code class="descname">toBlockMatrix</code><span class="sig-paren">(</span><em>rowsPerBlock=1024</em>, <em>colsPerBlock=1024</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#CoordinateMatrix.toBlockMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix.toBlockMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to a BlockMatrix.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rowsPerBlock</strong> – Number of rows that make up each block.
The blocks forming the final rows are not
required to have the given number of rows.</li>
<li><strong>colsPerBlock</strong> – Number of columns that make up each block.
The blocks forming the final columns are not
required to have the given number of columns.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mf">2.1</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">)</span><span class="o">.</span><span class="n">toBlockMatrix</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This CoordinateMatrix will have 7 effective rows, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># the highest row index being 6, and the ensuing</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># BlockMatrix will have 7 rows as well.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">7</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This CoordinateMatrix will have 5 columns, due to the</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># highest column index being 4, and the ensuing</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># BlockMatrix will have 5 columns as well.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">5</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix.toIndexedRowMatrix">
<code class="descname">toIndexedRowMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#CoordinateMatrix.toIndexedRowMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix.toIndexedRowMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to an IndexedRowMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mf">2.1</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">)</span><span class="o">.</span><span class="n">toIndexedRowMatrix</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This CoordinateMatrix will have 7 effective rows, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># the highest row index being 6, and the ensuing</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># IndexedRowMatrix will have 7 rows as well.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">7</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This CoordinateMatrix will have 5 columns, due to the</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># highest column index being 4, and the ensuing</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># IndexedRowMatrix will have 5 columns as well.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">5</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix.toRowMatrix">
<code class="descname">toRowMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#CoordinateMatrix.toRowMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix.toRowMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to a RowMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mf">2.1</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">)</span><span class="o">.</span><span class="n">toRowMatrix</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This CoordinateMatrix will have 7 effective rows, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># the highest row index being 6, but the ensuing RowMatrix</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># will only have 2 rows since there are only entries on 2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># unique rows.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">2</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This CoordinateMatrix will have 5 columns, due to the</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># highest column index being 4, and the ensuing RowMatrix</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># will have 5 columns as well.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">5</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.CoordinateMatrix.transpose">
<code class="descname">transpose</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#CoordinateMatrix.transpose"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.CoordinateMatrix.transpose" title="Permalink to this definition"></a></dt>
<dd><p>Transpose this CoordinateMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">entries</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">1.2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.7</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">entries</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat_transposed</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">transpose</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat_transposed</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">2</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat_transposed</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">3</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.DistributedMatrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">DistributedMatrix</code><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#DistributedMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.DistributedMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Represents a distributively stored matrix backed by one or
more RDDs.</p>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.DistributedMatrix.numCols">
<code class="descname">numCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#DistributedMatrix.numCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.DistributedMatrix.numCols" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of cols.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.DistributedMatrix.numRows">
<code class="descname">numRows</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#DistributedMatrix.numRows"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.DistributedMatrix.numRows" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of rows.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.IndexedRow">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">IndexedRow</code><span class="sig-paren">(</span><em>index</em>, <em>vector</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRow"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRow" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Represents a row of an IndexedRowMatrix.</p>
<p>Just a wrapper over a (long, vector) tuple.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>index</strong> – The index for the given row.</li>
<li><strong>vector</strong> – The row in the matrix at the given index.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">IndexedRowMatrix</code><span class="sig-paren">(</span><em>rows</em>, <em>numRows=0</em>, <em>numCols=0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.distributed.DistributedMatrix" title="pyspark.mllib.linalg.distributed.DistributedMatrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.distributed.DistributedMatrix</span></code></a></p>
<p>Represents a row-oriented distributed Matrix with indexed rows.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rows</strong> – An RDD of IndexedRows or (long, vector) tuples.</li>
<li><strong>numRows</strong> – Number of rows in the matrix. A non-positive
value means unknown, at which point the number
of rows will be determined by the max row
index plus one.</li>
<li><strong>numCols</strong> – Number of columns in the matrix. A non-positive
value means unknown, at which point the number
of columns will be determined by the size of
the first row.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.columnSimilarities">
<code class="descname">columnSimilarities</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.columnSimilarities"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.columnSimilarities" title="Permalink to this definition"></a></dt>
<dd><p>Compute all cosine similarities between columns.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cs</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">columnSimilarities</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">cs</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">3</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.computeGramianMatrix">
<code class="descname">computeGramianMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.computeGramianMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.computeGramianMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Computes the Gramian matrix <cite>A^T A</cite>.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This cannot be computed on matrices with more than 65535 columns.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">computeGramianMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.computeSVD">
<code class="descname">computeSVD</code><span class="sig-paren">(</span><em>k</em>, <em>computeU=False</em>, <em>rCond=1e-09</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.computeSVD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.computeSVD" title="Permalink to this definition"></a></dt>
<dd><p>Computes the singular value decomposition of the IndexedRowMatrix.</p>
<p>The given row matrix A of dimension (m X n) is decomposed into
U * s * V^T, where</p>
<ul class="simple">
<li>U: (m X k) (left singular vectors) is an IndexedRowMatrix
whose columns are the eigenvectors of (A X A’)</li>
<li>s: DenseVector consisting of the square roots of the eigenvalues
(singular values) in descending order.</li>
<li>V: (n X k) (right singular vectors) is a Matrix whose columns
are the eigenvectors of (A’ X A)</li>
</ul>
<p>For more specific details on implementation, please refer
to the Scala documentation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>k</strong> – Number of leading singular values to keep (<cite>0 &lt; k &lt;= n</cite>).
It might return fewer than k if there are numerically zero singular values
or if not enough Ritz values have converged before the maximum number of
Arnoldi update iterations is reached (in case matrix A is ill-conditioned).</li>
<li><strong>computeU</strong> – Whether or not to compute U. If set to
True, then U is computed as A * V * s^-1.</li>
<li><strong>rCond</strong> – Reciprocal condition number. All singular values
smaller than rCond * s[0] are treated as zero
where s[0] is the largest singular value.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">SingularValueDecomposition object</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">))]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">irm</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">rows</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span> <span class="o">=</span> <span class="n">irm</span><span class="o">.</span><span class="n">computeSVD</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span><span class="o">.</span><span class="n">U</span><span class="o">.</span><span class="n">rows</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[IndexedRow(0, [-0.707106781187,0.707106781187]), IndexedRow(1, [-0.707106781187,-0.707106781187])]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span><span class="o">.</span><span class="n">s</span>
<span class="go">DenseVector([3.4641, 3.1623])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span><span class="o">.</span><span class="n">V</span>
<span class="go">DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.multiply">
<code class="descname">multiply</code><span class="sig-paren">(</span><em>matrix</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.multiply"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.multiply" title="Permalink to this definition"></a></dt>
<dd><p>Multiply this matrix by a local dense matrix on the right.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>matrix</strong> – a local dense matrix whose number of rows must match the number of columns
of this matrix</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix" title="pyspark.mllib.linalg.distributed.IndexedRowMatrix"><code class="xref py py-class docutils literal"><span class="pre">IndexedRowMatrix</span></code></a></td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([(</span><span class="mi">0</span><span class="p">,</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">))]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">multiply</span><span class="p">(</span><span class="n">DenseMatrix</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">]))</span><span class="o">.</span><span class="n">rows</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.numCols">
<code class="descname">numCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.numCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.numCols" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of cols.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">[</span><span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">3</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">6</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.numRows">
<code class="descname">numRows</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.numRows"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.numRows" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of rows.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">[</span><span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">])])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">4</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">7</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.rows">
<code class="descname">rows</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.rows" title="Permalink to this definition"></a></dt>
<dd><p>Rows of the IndexedRowMatrix stored as an RDD of IndexedRows.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">rows</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="go">IndexedRow(0, [1.0,2.0,3.0])</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.toBlockMatrix">
<code class="descname">toBlockMatrix</code><span class="sig-paren">(</span><em>rowsPerBlock=1024</em>, <em>colsPerBlock=1024</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.toBlockMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.toBlockMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to a BlockMatrix.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rowsPerBlock</strong> – Number of rows that make up each block.
The blocks forming the final rows are not
required to have the given number of rows.</li>
<li><strong>colsPerBlock</strong> – Number of columns that make up each block.
The blocks forming the final columns are not
required to have the given number of columns.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span><span class="o">.</span><span class="n">toBlockMatrix</span><span class="p">()</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># This IndexedRowMatrix will have 7 effective rows, due to</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># the highest row index being 6, and the ensuing</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># BlockMatrix will have 7 rows as well.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">7</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">3</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.toCoordinateMatrix">
<code class="descname">toCoordinateMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.toCoordinateMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.toCoordinateMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to a CoordinateMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span><span class="o">.</span><span class="n">toCoordinateMatrix</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">entries</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="go">[MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 0.0), MatrixEntry(6, 0, 0.0)]</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.IndexedRowMatrix.toRowMatrix">
<code class="descname">toRowMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#IndexedRowMatrix.toRowMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.IndexedRowMatrix.toRowMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Convert this matrix to a RowMatrix.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">IndexedRow</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">IndexedRow</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span><span class="o">.</span><span class="n">toRowMatrix</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">rows</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0])]</span>
</pre></div>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.MatrixEntry">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">MatrixEntry</code><span class="sig-paren">(</span><em>i</em>, <em>j</em>, <em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#MatrixEntry"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.MatrixEntry" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">object</span></code></p>
<p>Represents an entry of a CoordinateMatrix.</p>
<p>Just a wrapper over a (long, long, float) tuple.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>i</strong> – The row index of the matrix.</li>
<li><strong>j</strong> – The column index of the matrix.</li>
<li><strong>value</strong> – The (i, j)th entry of the matrix, as a float.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">RowMatrix</code><span class="sig-paren">(</span><em>rows</em>, <em>numRows=0</em>, <em>numCols=0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#pyspark.mllib.linalg.distributed.DistributedMatrix" title="pyspark.mllib.linalg.distributed.DistributedMatrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.distributed.DistributedMatrix</span></code></a></p>
<p>Represents a row-oriented distributed Matrix with no meaningful
row indices.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rows</strong> – An RDD of vectors.</li>
<li><strong>numRows</strong> – Number of rows in the matrix. A non-positive
value means unknown, at which point the number
of rows will be determined by the number of
records in the <cite>rows</cite> RDD.</li>
<li><strong>numCols</strong> – Number of columns in the matrix. A non-positive
value means unknown, at which point the number
of columns will be determined by the size of
the first row.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.columnSimilarities">
<code class="descname">columnSimilarities</code><span class="sig-paren">(</span><em>threshold=0.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.columnSimilarities"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.columnSimilarities" title="Permalink to this definition"></a></dt>
<dd><p>Compute similarities between columns of this matrix.</p>
<p>The threshold parameter is a trade-off knob between estimate
quality and computational cost.</p>
<p>The default threshold setting of 0 guarantees deterministically
correct results, but uses the brute-force approach of computing
normalized dot products.</p>
<p>Setting the threshold to positive values uses a sampling
approach and incurs strictly less computational cost than the
brute-force approach. However the similarities computed will
be estimates.</p>
<p>The sampling guarantees relative-error correctness for those
pairs of columns that have similarity greater than the given
similarity threshold.</p>
<dl class="docutils">
<dt>To describe the guarantee, we set some notation:</dt>
<dd><ul class="first last simple">
<li>Let A be the smallest in magnitude non-zero element of
this matrix.</li>
<li>Let B be the largest in magnitude non-zero element of
this matrix.</li>
<li>Let L be the maximum number of non-zeros per row.</li>
</ul>
</dd>
</dl>
<p>For example, for {0,1} matrices: A=B=1.
Another example, for the Netflix matrix: A=1, B=5.</p>
<p>For those column pairs that are above the threshold, the
computed similarity is correct to within 20% relative error
with probability at least 1 - (0.981)<sup>10/B</sup>.</p>
<p>The shuffle size is bounded by the <em>smaller</em> of the following
two expressions:</p>
<blockquote>
<div><ul class="simple">
<li>O(n log(n) L / (threshold * A))</li>
<li>O(m L<sup>2</sup>)</li>
</ul>
</div></blockquote>
<p>The latter is the cost of the brute-force approach, so for
non-zero thresholds, the cost is always cheaper than the
brute-force approach.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>threshold</strong> – Set to 0 for deterministic guaranteed
correctness. Similarities above this
threshold are estimated with the cost vs
estimate quality trade-off described above.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">An n x n sparse upper-triangular CoordinateMatrix of
cosine similarities between columns of this matrix.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">5</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">sims</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">columnSimilarities</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sims</span><span class="o">.</span><span class="n">entries</span><span class="o">.</span><span class="n">first</span><span class="p">()</span><span class="o">.</span><span class="n">value</span>
<span class="go">0.91914503...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.computeColumnSummaryStatistics">
<code class="descname">computeColumnSummaryStatistics</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.computeColumnSummaryStatistics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.computeColumnSummaryStatistics" title="Permalink to this definition"></a></dt>
<dd><p>Computes column-wise summary statistics.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"><code class="xref py py-class docutils literal"><span class="pre">MultivariateStatisticalSummary</span></code> object
containing column-wise summary statistics.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">colStats</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">computeColumnSummaryStatistics</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">colStats</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="go">array([ 2.5, 3.5, 4.5])</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.computeCovariance">
<code class="descname">computeCovariance</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.computeCovariance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.computeCovariance" title="Permalink to this definition"></a></dt>
<dd><p>Computes the covariance matrix, treating each row as an
observation.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This cannot be computed on matrices with more than 65535 columns.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">computeCovariance</span><span class="p">()</span>
<span class="go">DenseMatrix(2, 2, [0.5, -0.5, -0.5, 0.5], 0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.computeGramianMatrix">
<code class="descname">computeGramianMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.computeGramianMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.computeGramianMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Computes the Gramian matrix <cite>A^T A</cite>.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This cannot be computed on matrices with more than 65535 columns.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">computeGramianMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.computePrincipalComponents">
<code class="descname">computePrincipalComponents</code><span class="sig-paren">(</span><em>k</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.computePrincipalComponents"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.computePrincipalComponents" title="Permalink to this definition"></a></dt>
<dd><p>Computes the k principal components of the given row matrix.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This cannot be computed on matrices with more than 65535 columns.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>k</strong> – Number of principal components to keep.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="#pyspark.mllib.linalg.DenseMatrix" title="pyspark.mllib.linalg.DenseMatrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.DenseMatrix</span></code></a></td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">1</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rm</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Returns the two principal components of rm</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pca</span> <span class="o">=</span> <span class="n">rm</span><span class="o">.</span><span class="n">computePrincipalComponents</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pca</span>
<span class="go">DenseMatrix(3, 2, [-0.349, -0.6981, 0.6252, -0.2796, -0.5592, -0.7805], 0)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Transform into new dimensions with the greatest variance.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rm</span><span class="o">.</span><span class="n">multiply</span><span class="p">(</span><span class="n">pca</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[DenseVector([0.1305, -3.7394]), DenseVector([-0.3642, -6.6983]), DenseVector([-4.6102, -4.9745])]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.computeSVD">
<code class="descname">computeSVD</code><span class="sig-paren">(</span><em>k</em>, <em>computeU=False</em>, <em>rCond=1e-09</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.computeSVD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.computeSVD" title="Permalink to this definition"></a></dt>
<dd><p>Computes the singular value decomposition of the RowMatrix.</p>
<p>The given row matrix A of dimension (m X n) is decomposed into
U * s * V’ (where ’ denotes the transpose), where</p>
<ul class="simple">
<li>U: (m X k) (left singular vectors) is a RowMatrix whose
columns are the eigenvectors of (A X A’).</li>
<li>s: DenseVector consisting of the square roots of the eigenvalues
(singular values) in descending order.</li>
<li>V: (n X k) (right singular vectors) is a Matrix whose columns
are the eigenvectors of (A’ X A).</li>
</ul>
<p>For more specific details on the implementation, please refer
to the Scala documentation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>k</strong> – Number of leading singular values to keep (<cite>0 &lt; k &lt;= n</cite>).
Fewer than k values may be returned if there are numerically zero singular values
or if not enough Ritz values converge before the maximum number of
Arnoldi update iterations is reached (in case matrix A is ill-conditioned).</li>
<li><strong>computeU</strong> – Whether or not to compute U. If set to
True, then U is computed as A * V * s<sup>-1</sup>.</li>
<li><strong>rCond</strong> – Reciprocal condition number. All singular values
smaller than rCond * s[0] are treated as zero
where s[0] is the largest singular value.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><a class="reference internal" href="#pyspark.mllib.linalg.distributed.SingularValueDecomposition" title="pyspark.mllib.linalg.distributed.SingularValueDecomposition"><code class="xref py py-class docutils literal"><span class="pre">SingularValueDecomposition</span></code></a></p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rm</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span> <span class="o">=</span> <span class="n">rm</span><span class="o">.</span><span class="n">computeSVD</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span><span class="o">.</span><span class="n">U</span><span class="o">.</span><span class="n">rows</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span><span class="o">.</span><span class="n">s</span>
<span class="go">DenseVector([3.4641, 3.1623])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svd_model</span><span class="o">.</span><span class="n">V</span>
<span class="go">DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.multiply">
<code class="descname">multiply</code><span class="sig-paren">(</span><em>matrix</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.multiply"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.multiply" title="Permalink to this definition"></a></dt>
<dd><p>Multiply this matrix by a local dense matrix on the right.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>matrix</strong> – a local dense matrix whose number of rows must match the number of columns
of this matrix</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="#pyspark.mllib.linalg.distributed.RowMatrix" title="pyspark.mllib.linalg.distributed.RowMatrix"><code class="xref py py-class docutils literal"><span class="pre">RowMatrix</span></code></a></td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rm</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rm</span><span class="o">.</span><span class="n">multiply</span><span class="p">(</span><span class="n">DenseMatrix</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">]))</span><span class="o">.</span><span class="n">rows</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.numCols">
<code class="descname">numCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.numCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.numCols" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of cols.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">],</span> <span class="p">[</span><span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">3</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numCols</span><span class="p">())</span>
<span class="go">6</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.numRows">
<code class="descname">numRows</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.numRows"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.numRows" title="Permalink to this definition"></a></dt>
<dd><p>Get or compute the number of rows.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">],</span> <span class="p">[</span><span class="mi">10</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="mi">12</span><span class="p">]])</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">4</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">numRows</span><span class="p">())</span>
<span class="go">7</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.rows">
<code class="descname">rows</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.rows" title="Permalink to this definition"></a></dt>
<dd><p>Rows of the RowMatrix stored as an RDD of vectors.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">rows</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="go">DenseVector([1.0, 2.0, 3.0])</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.linalg.distributed.RowMatrix.tallSkinnyQR">
<code class="descname">tallSkinnyQR</code><span class="sig-paren">(</span><em>computeQ=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#RowMatrix.tallSkinnyQR"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.RowMatrix.tallSkinnyQR" title="Permalink to this definition"></a></dt>
<dd><p>Compute the QR decomposition of this RowMatrix.</p>
<p>The implementation is designed to optimize the QR decomposition
(factorization) for the RowMatrix of a tall and skinny shape.</p>
<dl class="docutils">
<dt>Reference:</dt>
<dd>Paul G. Constantine, David F. Gleich. “Tall and skinny QR
factorizations in MapReduce architectures”
(<a class="reference external" href="https://doi.org/10.1145/1996092.1996103">https://doi.org/10.1145/1996092.1996103</a>)</dd>
</dl>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>computeQ</strong> – Whether to compute Q.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">QRDecomposition(Q: RowMatrix, R: Matrix), where
Q = None if computeQ = False.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mi">3</span><span class="p">,</span> <span class="o">-</span><span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="o">-</span><span class="mi">8</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">rows</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">decomp</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">tallSkinnyQR</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Q</span> <span class="o">=</span> <span class="n">decomp</span><span class="o">.</span><span class="n">Q</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">R</span> <span class="o">=</span> <span class="n">decomp</span><span class="o">.</span><span class="n">R</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Test with absolute values</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">absQRows</span> <span class="o">=</span> <span class="n">Q</span><span class="o">.</span><span class="n">rows</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="nb">abs</span><span class="p">(</span><span class="n">row</span><span class="o">.</span><span class="n">toArray</span><span class="p">())</span><span class="o">.</span><span class="n">tolist</span><span class="p">())</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">absQRows</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[[0.6..., 0.0], [0.8..., 0.0], [0.0, 1.0]]</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Test with absolute values</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">R</span><span class="o">.</span><span class="n">toArray</span><span class="p">())</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
<span class="go">[[5.0, 10.0], [0.0, 1.0]]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.linalg.distributed.SingularValueDecomposition">
<em class="property">class </em><code class="descclassname">pyspark.mllib.linalg.distributed.</code><code class="descname">SingularValueDecomposition</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/linalg/distributed.html#SingularValueDecomposition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.linalg.distributed.SingularValueDecomposition" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.common.JavaModelWrapper</span></code></p>
<p>Represents singular value decomposition (SVD) factors.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.SingularValueDecomposition.U">
<code class="descname">U</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.SingularValueDecomposition.U" title="Permalink to this definition"></a></dt>
<dd><p>Returns a distributed matrix whose columns are the left
singular vectors of the SingularValueDecomposition if computeU was set to be True.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.SingularValueDecomposition.V">
<code class="descname">V</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.SingularValueDecomposition.V" title="Permalink to this definition"></a></dt>
<dd><p>Returns a DenseMatrix whose columns are the right singular
vectors of the SingularValueDecomposition.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.linalg.distributed.SingularValueDecomposition.s">
<code class="descname">s</code><a class="headerlink" href="#pyspark.mllib.linalg.distributed.SingularValueDecomposition.s" title="Permalink to this definition"></a></dt>
<dd><p>Returns a DenseVector with singular values in descending order.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.random">
<span id="pyspark-mllib-random-module"></span><h2>pyspark.mllib.random module<a class="headerlink" href="#module-pyspark.mllib.random" title="Permalink to this headline"></a></h2>
<p>Python package for random data generation.</p>
<dl class="class">
<dt id="pyspark.mllib.random.RandomRDDs">
<em class="property">class </em><code class="descclassname">pyspark.mllib.random.</code><code class="descname">RandomRDDs</code><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs" title="Permalink to this definition"></a></dt>
<dd><p>Generator methods for creating RDDs comprised of i.i.d. samples from
some distribution.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.exponentialRDD">
<em class="property">static </em><code class="descname">exponentialRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>mean</em>, <em>size</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.exponentialRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.exponentialRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of i.i.d. samples from the Exponential
distribution with the input mean.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>mean</strong> – Mean, or 1 / lambda, for the Exponential distribution.</li>
<li><strong>size</strong> – Size of the RDD.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of float comprised of i.i.d. samples ~ Exp(mean).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mean</span> <span class="o">=</span> <span class="mf">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">exponentialRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="mi">1000</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">1000</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">mean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">stdev</span><span class="p">()</span> <span class="o">-</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">mean</span><span class="p">))</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.exponentialVectorRDD">
<em class="property">static </em><code class="descname">exponentialVectorRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>mean</em>, <em>numRows</em>, <em>numCols</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.exponentialVectorRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.exponentialVectorRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the Exponential distribution with the input mean.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>mean</strong> – Mean, or 1 / lambda, for the Exponential distribution.</li>
<li><strong>numRows</strong> – Number of Vectors in the RDD.</li>
<li><strong>numCols</strong> – Number of elements in each Vector.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mean</span> <span class="o">=</span> <span class="mf">0.5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">exponentialVectorRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">mat</span><span class="p">(</span><span class="n">rdd</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">shape</span>
<span class="go">(100, 100)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">mean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">std</span><span class="p">()</span> <span class="o">-</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">mean</span><span class="p">))</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.gammaRDD">
<em class="property">static </em><code class="descname">gammaRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>shape</em>, <em>scale</em>, <em>size</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.gammaRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.gammaRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of i.i.d. samples from the Gamma
distribution with the input shape and scale.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>shape</strong> – shape (&gt; 0) parameter for the Gamma distribution</li>
<li><strong>scale</strong> – scale (&gt; 0) parameter for the Gamma distribution</li>
<li><strong>size</strong> – Size of the RDD.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">shape</span> <span class="o">=</span> <span class="mf">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">scale</span> <span class="o">=</span> <span class="mf">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expMean</span> <span class="o">=</span> <span class="n">shape</span> <span class="o">*</span> <span class="n">scale</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expStd</span> <span class="o">=</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">shape</span> <span class="o">*</span> <span class="n">scale</span> <span class="o">*</span> <span class="n">scale</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">gammaRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">shape</span><span class="p">,</span> <span class="n">scale</span><span class="p">,</span> <span class="mi">1000</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">1000</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">expMean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">stdev</span><span class="p">()</span> <span class="o">-</span> <span class="n">expStd</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.gammaVectorRDD">
<em class="property">static </em><code class="descname">gammaVectorRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>shape</em>, <em>scale</em>, <em>numRows</em>, <em>numCols</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.gammaVectorRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.gammaVectorRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the Gamma distribution.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>shape</strong> – Shape (&gt; 0) of the Gamma distribution</li>
<li><strong>scale</strong> – Scale (&gt; 0) of the Gamma distribution</li>
<li><strong>numRows</strong> – Number of Vectors in the RDD.</li>
<li><strong>numCols</strong> – Number of elements in each Vector.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">shape</span> <span class="o">=</span> <span class="mf">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">scale</span> <span class="o">=</span> <span class="mf">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expMean</span> <span class="o">=</span> <span class="n">shape</span> <span class="o">*</span> <span class="n">scale</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expStd</span> <span class="o">=</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">shape</span> <span class="o">*</span> <span class="n">scale</span> <span class="o">*</span> <span class="n">scale</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">matrix</span><span class="p">(</span><span class="n">RandomRDDs</span><span class="o">.</span><span class="n">gammaVectorRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">shape</span><span class="p">,</span> <span class="n">scale</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">shape</span>
<span class="go">(100, 100)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">expMean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">std</span><span class="p">()</span> <span class="o">-</span> <span class="n">expStd</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.logNormalRDD">
<em class="property">static </em><code class="descname">logNormalRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>mean</em>, <em>std</em>, <em>size</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.logNormalRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.logNormalRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of i.i.d. samples from the log normal
distribution with the input mean and standard deviation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>mean</strong> – mean for the log Normal distribution</li>
<li><strong>std</strong> – std for the log Normal distribution</li>
<li><strong>size</strong> – Size of the RDD.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of float comprised of i.i.d. samples ~ log N(mean, std).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span><span class="p">,</span> <span class="n">exp</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mean</span> <span class="o">=</span> <span class="mf">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">std</span> <span class="o">=</span> <span class="mf">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expMean</span> <span class="o">=</span> <span class="n">exp</span><span class="p">(</span><span class="n">mean</span> <span class="o">+</span> <span class="mf">0.5</span> <span class="o">*</span> <span class="n">std</span> <span class="o">*</span> <span class="n">std</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expStd</span> <span class="o">=</span> <span class="n">sqrt</span><span class="p">((</span><span class="n">exp</span><span class="p">(</span><span class="n">std</span> <span class="o">*</span> <span class="n">std</span><span class="p">)</span> <span class="o">-</span> <span class="mf">1.0</span><span class="p">)</span> <span class="o">*</span> <span class="n">exp</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">*</span> <span class="n">mean</span> <span class="o">+</span> <span class="n">std</span> <span class="o">*</span> <span class="n">std</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">logNormalRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="n">std</span><span class="p">,</span> <span class="mi">1000</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">1000</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">expMean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">stdev</span><span class="p">()</span> <span class="o">-</span> <span class="n">expStd</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.logNormalVectorRDD">
<em class="property">static </em><code class="descname">logNormalVectorRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>mean</em>, <em>std</em>, <em>numRows</em>, <em>numCols</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.logNormalVectorRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.logNormalVectorRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the log normal distribution.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>mean</strong> – Mean of the log normal distribution</li>
<li><strong>std</strong> – Standard Deviation of the log normal distribution</li>
<li><strong>numRows</strong> – Number of Vectors in the RDD.</li>
<li><strong>numCols</strong> – Number of elements in each Vector.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of Vector with vectors containing i.i.d. samples ~ log <cite>N(mean, std)</cite>.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span><span class="p">,</span> <span class="n">exp</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mean</span> <span class="o">=</span> <span class="mf">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">std</span> <span class="o">=</span> <span class="mf">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expMean</span> <span class="o">=</span> <span class="n">exp</span><span class="p">(</span><span class="n">mean</span> <span class="o">+</span> <span class="mf">0.5</span> <span class="o">*</span> <span class="n">std</span> <span class="o">*</span> <span class="n">std</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expStd</span> <span class="o">=</span> <span class="n">sqrt</span><span class="p">((</span><span class="n">exp</span><span class="p">(</span><span class="n">std</span> <span class="o">*</span> <span class="n">std</span><span class="p">)</span> <span class="o">-</span> <span class="mf">1.0</span><span class="p">)</span> <span class="o">*</span> <span class="n">exp</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">*</span> <span class="n">mean</span> <span class="o">+</span> <span class="n">std</span> <span class="o">*</span> <span class="n">std</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">m</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">logNormalVectorRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="n">std</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">matrix</span><span class="p">(</span><span class="n">m</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">shape</span>
<span class="go">(100, 100)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">expMean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">std</span><span class="p">()</span> <span class="o">-</span> <span class="n">expStd</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.normalRDD">
<em class="property">static </em><code class="descname">normalRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>size</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.normalRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.normalRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of i.i.d. samples from the standard normal
distribution.</p>
<p>To transform the distribution in the generated RDD from standard normal
to some other normal N(mean, sigma^2), use
<cite>RandomRDDs.normalRDD(sc, n, p, seed).map(lambda v: mean + sigma * v)</cite></p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>size</strong> – Size of the RDD.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">normalRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="mi">1000</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">1000</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="mf">0.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">stdev</span><span class="p">()</span> <span class="o">-</span> <span class="mf">1.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.normalVectorRDD">
<em class="property">static </em><code class="descname">normalVectorRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>numRows</em>, <em>numCols</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.normalVectorRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.normalVectorRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the standard normal distribution.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>numRows</strong> – Number of Vectors in the RDD.</li>
<li><strong>numCols</strong> – Number of elements in each Vector.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of Vector with vectors containing i.i.d. samples ~ <cite>N(0.0, 1.0)</cite>.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">matrix</span><span class="p">(</span><span class="n">RandomRDDs</span><span class="o">.</span><span class="n">normalVectorRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">shape</span>
<span class="go">(100, 100)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="mf">0.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">std</span><span class="p">()</span> <span class="o">-</span> <span class="mf">1.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.1</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.poissonRDD">
<em class="property">static </em><code class="descname">poissonRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>mean</em>, <em>size</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.poissonRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.poissonRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of i.i.d. samples from the Poisson
distribution with the input mean.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>mean</strong> – Mean, or lambda, for the Poisson distribution.</li>
<li><strong>size</strong> – Size of the RDD.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of float comprised of i.i.d. samples ~ Pois(mean).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">mean</span> <span class="o">=</span> <span class="mf">100.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">poissonRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="mi">1000</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stats</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">1000</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">mean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">stats</span><span class="o">.</span><span class="n">stdev</span><span class="p">()</span> <span class="o">-</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">mean</span><span class="p">))</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.poissonVectorRDD">
<em class="property">static </em><code class="descname">poissonVectorRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>mean</em>, <em>numRows</em>, <em>numCols</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.poissonVectorRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.poissonVectorRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the Poisson distribution with the input mean.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>mean</strong> – Mean, or lambda, for the Poisson distribution.</li>
<li><strong>numRows</strong> – Number of Vectors in the RDD.</li>
<li><strong>numCols</strong> – Number of elements in each Vector.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean).</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mean</span> <span class="o">=</span> <span class="mf">100.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">poissonVectorRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">mat</span><span class="p">(</span><span class="n">rdd</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">shape</span>
<span class="go">(100, 100)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="o">-</span> <span class="n">mean</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">sqrt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">mat</span><span class="o">.</span><span class="n">std</span><span class="p">()</span> <span class="o">-</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">mean</span><span class="p">))</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.uniformRDD">
<em class="property">static </em><code class="descname">uniformRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>size</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.uniformRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.uniformRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of i.i.d. samples from the
uniform distribution U(0.0, 1.0).</p>
<p>To transform the distribution in the generated RDD from U(0.0, 1.0)
to U(a, b), use
<cite>RandomRDDs.uniformRDD(sc, n, p, seed).map(lambda v: a + (b - a) * v)</cite></p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>size</strong> – Size of the RDD.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Random seed (default: a random long integer).</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of float comprised of i.i.d. samples ~ <cite>U(0.0, 1.0)</cite>.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">uniformRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="go">100</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">max</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="mf">1.0</span> <span class="ow">and</span> <span class="nb">min</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="mf">0.0</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">RandomRDDs</span><span class="o">.</span><span class="n">uniformRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span>
<span class="go">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parts</span> <span class="o">=</span> <span class="n">RandomRDDs</span><span class="o">.</span><span class="n">uniformRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parts</span> <span class="o">==</span> <span class="n">sc</span><span class="o">.</span><span class="n">defaultParallelism</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.random.RandomRDDs.uniformVectorRDD">
<em class="property">static </em><code class="descname">uniformVectorRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>numRows</em>, <em>numCols</em>, <em>numPartitions=None</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/random.html#RandomRDDs.uniformVectorRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.random.RandomRDDs.uniformVectorRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the uniform distribution U(0.0, 1.0).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – SparkContext used to create the RDD.</li>
<li><strong>numRows</strong> – Number of Vectors in the RDD.</li>
<li><strong>numCols</strong> – Number of elements in each Vector.</li>
<li><strong>numPartitions</strong> – Number of partitions in the RDD (default: <cite>sc.defaultParallelism</cite>).</li>
<li><strong>seed</strong> – Seed for the RNG that generates the seed for the generator in each partition.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RDD of Vector with vectors containing i.i.d. samples ~ <cite>U(0.0, 1.0)</cite>.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">matrix</span><span class="p">(</span><span class="n">RandomRDDs</span><span class="o">.</span><span class="n">uniformVectorRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">shape</span>
<span class="go">(10, 10)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mat</span><span class="o">.</span><span class="n">max</span><span class="p">()</span> <span class="o">&lt;=</span> <span class="mf">1.0</span> <span class="ow">and</span> <span class="n">mat</span><span class="o">.</span><span class="n">min</span><span class="p">()</span> <span class="o">&gt;=</span> <span class="mf">0.0</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">RandomRDDs</span><span class="o">.</span><span class="n">uniformVectorRDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span>
<span class="go">4</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.recommendation">
<span id="pyspark-mllib-recommendation-module"></span><h2>pyspark.mllib.recommendation module<a class="headerlink" href="#module-pyspark.mllib.recommendation" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.recommendation.</code><code class="descname">MatrixFactorizationModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel" title="Permalink to this definition"></a></dt>
<dd><p>A matrix factorization model trained by regularized alternating
least-squares.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">r1</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r2</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r3</span> <span class="o">=</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ratings</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">r1</span><span class="p">,</span> <span class="n">r2</span><span class="p">,</span> <span class="n">r3</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">trainImplicit</span><span class="p">(</span><span class="n">ratings</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="go">0.4...</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">testset</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">ratings</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predictAll</span><span class="p">(</span><span class="n">testset</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Rating(user=1, product=1, rating=1.0...), Rating(user=1, product=2, rating=1.9...)]</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">ratings</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">userFeatures</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[(1, array(&#39;d&#39;, [...])), (2, array(&#39;d&#39;, [...]))]</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">recommendUsers</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="go">[Rating(user=2, product=1, rating=1.9...), Rating(user=1, product=1, rating=1.0...)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">recommendProducts</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="go">[Rating(user=1, product=2, rating=1.9...), Rating(user=1, product=1, rating=1.0...)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">rank</span>
<span class="go">4</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">first_user</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">userFeatures</span><span class="p">()</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">latents</span> <span class="o">=</span> <span class="n">first_user</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">latents</span><span class="p">)</span>
<span class="go">4</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">productFeatures</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[(1, array(&#39;d&#39;, [...])), (2, array(&#39;d&#39;, [...]))]</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">first_product</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">productFeatures</span><span class="p">()</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">latents</span> <span class="o">=</span> <span class="n">first_product</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">latents</span><span class="p">)</span>
<span class="go">4</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">products_for_users</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">recommendProductsForUsers</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">products_for_users</span><span class="p">)</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">products_for_users</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">(1, (Rating(user=1, product=2, rating=...),))</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">users_for_products</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">recommendUsersForProducts</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">users_for_products</span><span class="p">)</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">users_for_products</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">(1, (Rating(user=2, product=1, rating=...),))</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">ratings</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">nonnegative</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="go">3.73...</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span><span class="n">Rating</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="n">Rating</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="n">Rating</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">nonnegative</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="go">3.73...</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">trainImplicit</span><span class="p">(</span><span class="n">ratings</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">nonnegative</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="go">0.4...</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">MatrixFactorizationModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="go">0.4...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predictAll</span><span class="p">(</span><span class="n">testset</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Rating(...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.1.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>user</em>, <em>product</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predicts rating for the given user and product.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.predictAll">
<code class="descname">predictAll</code><span class="sig-paren">(</span><em>user_product</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.predictAll"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.predictAll" title="Permalink to this definition"></a></dt>
<dd><p>Returns an RDD of predicted ratings for the input user and product
pairs.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.productFeatures">
<code class="descname">productFeatures</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.productFeatures"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.productFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns a paired RDD, where the first element is the product and the
second is an array of features corresponding to that product.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.rank">
<code class="descname">rank</code><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.rank" title="Permalink to this definition"></a></dt>
<dd><p>Rank for the features in this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.recommendProducts">
<code class="descname">recommendProducts</code><span class="sig-paren">(</span><em>user</em>, <em>num</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.recommendProducts"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.recommendProducts" title="Permalink to this definition"></a></dt>
<dd><p>Recommends the top “num” number of products for a given user and
returns a list of Rating objects sorted by the predicted rating in
descending order.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.recommendProductsForUsers">
<code class="descname">recommendProductsForUsers</code><span class="sig-paren">(</span><em>num</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.recommendProductsForUsers"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.recommendProductsForUsers" title="Permalink to this definition"></a></dt>
<dd><p>Recommends the top “num” number of products for all users. The
number of recommendations returned per user may be less than “num”.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.recommendUsers">
<code class="descname">recommendUsers</code><span class="sig-paren">(</span><em>product</em>, <em>num</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.recommendUsers"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.recommendUsers" title="Permalink to this definition"></a></dt>
<dd><p>Recommends the top “num” number of users for a given product and
returns a list of Rating objects sorted by the predicted rating in
descending order.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.recommendUsersForProducts">
<code class="descname">recommendUsersForProducts</code><span class="sig-paren">(</span><em>num</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.recommendUsersForProducts"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.recommendUsersForProducts" title="Permalink to this definition"></a></dt>
<dd><p>Recommends the top “num” number of users for all products. The
number of recommendations returned per product may be less than
“num”.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.recommendation.MatrixFactorizationModel.userFeatures">
<code class="descname">userFeatures</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#MatrixFactorizationModel.userFeatures"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.MatrixFactorizationModel.userFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns a paired RDD, where the first element is the user and the
second is an array of features corresponding to that user.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.recommendation.ALS">
<em class="property">class </em><code class="descclassname">pyspark.mllib.recommendation.</code><code class="descname">ALS</code><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#ALS"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.ALS" title="Permalink to this definition"></a></dt>
<dd><p>Alternating Least Squares matrix factorization.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.recommendation.ALS.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>ratings</em>, <em>rank</em>, <em>iterations=5</em>, <em>lambda_=0.01</em>, <em>blocks=-1</em>, <em>nonnegative=False</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#ALS.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.ALS.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a matrix factorization model given an RDD of ratings by users
for a subset of products. The ratings matrix is approximated as the
product of two lower-rank matrices of a given rank (number of
features). To solve for these features, ALS is run iteratively with
a configurable level of parallelism.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>ratings</strong> – RDD of <cite>Rating</cite> or (userID, productID, rating) tuple.</li>
<li><strong>rank</strong> – Number of features to use (also referred to as the number of latent factors).</li>
<li><strong>iterations</strong> – Number of iterations of ALS.
(default: 5)</li>
<li><strong>lambda_</strong> – Regularization parameter.
(default: 0.01)</li>
<li><strong>blocks</strong> – Number of blocks used to parallelize the computation. A value
of -1 will use an auto-configured number of blocks.
(default: -1)</li>
<li><strong>nonnegative</strong> – A value of True will solve least-squares with nonnegativity
constraints.
(default: False)</li>
<li><strong>seed</strong> – Random seed for initial matrix factorization model. A value
of None will use system time as the seed.
(default: None)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.recommendation.ALS.trainImplicit">
<em class="property">classmethod </em><code class="descname">trainImplicit</code><span class="sig-paren">(</span><em>ratings</em>, <em>rank</em>, <em>iterations=5</em>, <em>lambda_=0.01</em>, <em>blocks=-1</em>, <em>alpha=0.01</em>, <em>nonnegative=False</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#ALS.trainImplicit"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.ALS.trainImplicit" title="Permalink to this definition"></a></dt>
<dd><p>Train a matrix factorization model given an RDD of ‘implicit
preferences’ of users for a subset of products. The ratings matrix
is approximated as the product of two lower-rank matrices of a
given rank (number of features). To solve for these features, ALS
is run iteratively with a configurable level of parallelism.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>ratings</strong> – RDD of <cite>Rating</cite> or (userID, productID, rating) tuple.</li>
<li><strong>rank</strong> – Number of features to use (also referred to as the number of latent factors).</li>
<li><strong>iterations</strong> – Number of iterations of ALS.
(default: 5)</li>
<li><strong>lambda_</strong> – Regularization parameter.
(default: 0.01)</li>
<li><strong>blocks</strong> – Number of blocks used to parallelize the computation. A value
of -1 will use an auto-configured number of blocks.
(default: -1)</li>
<li><strong>alpha</strong> – A constant used in computing confidence.
(default: 0.01)</li>
<li><strong>nonnegative</strong> – A value of True will solve least-squares with nonnegativity
constraints.
(default: False)</li>
<li><strong>seed</strong> – Random seed for initial matrix factorization model. A value
of None will use system time as the seed.
(default: None)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.recommendation.Rating">
<em class="property">class </em><code class="descclassname">pyspark.mllib.recommendation.</code><code class="descname">Rating</code><a class="reference internal" href="_modules/pyspark/mllib/recommendation.html#Rating"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.recommendation.Rating" title="Permalink to this definition"></a></dt>
<dd><p>Represents a (user, product, rating) tuple.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">r</span> <span class="o">=</span> <span class="n">Rating</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="p">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="p">,</span> <span class="n">r</span><span class="o">.</span><span class="n">rating</span><span class="p">)</span>
<span class="go">(1, 2, 5.0)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">r</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">r</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">r</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span>
<span class="go">(1, 2, 5.0)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.regression">
<span id="pyspark-mllib-regression-module"></span><h2>pyspark.mllib.regression module<a class="headerlink" href="#module-pyspark.mllib.regression" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.regression.LabeledPoint">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">LabeledPoint</code><span class="sig-paren">(</span><em>label</em>, <em>features</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LabeledPoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LabeledPoint" title="Permalink to this definition"></a></dt>
<dd><p>Class that represents the features and labels of a data point.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>label</strong> – Label for this data point.</li>
<li><strong>features</strong> – Vector of features for this point (NumPy array, list,
pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix).</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">‘label’ and ‘features’ are accessible as attributes of each LabeledPoint instance.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.LinearModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">LinearModel</code><span class="sig-paren">(</span><em>weights</em>, <em>intercept</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LinearModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LinearModel" title="Permalink to this definition"></a></dt>
<dd><p>A linear model that has a vector of coefficients and an intercept.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>weights</strong> – Weights computed for every feature.</li>
<li><strong>intercept</strong> – Intercept computed for this model.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.regression.LinearModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.mllib.regression.LinearModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Intercept computed for this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.regression.LinearModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.mllib.regression.LinearModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weights computed for every feature.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.LinearRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">LinearRegressionModel</code><span class="sig-paren">(</span><em>weights</em>, <em>intercept</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LinearRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>A linear regression model derived from a least-squares fit.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">3.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">1.0</span><span class="p">]]))</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">LinearRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">3.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">3.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">step</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">miniBatchFraction</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]),</span> <span class="n">regParam</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span> <span class="n">regType</span><span class="o">=</span><span class="s2">&quot;l2&quot;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">intercept</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">validateData</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.regression.LinearRegressionModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Intercept computed for this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.LinearRegressionModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LinearRegressionModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a LinearRegressionModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.LinearRegressionModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict the value of the dependent variable given a vector or
an RDD of vectors containing values for the independent variables.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.LinearRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LinearRegressionModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save a LinearRegressionModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.regression.LinearRegressionModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weights computed for every feature.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.LinearRegressionWithSGD">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">LinearRegressionWithSGD</code><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LinearRegressionWithSGD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionWithSGD" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Deprecated in 2.0.0. Use ml.regression.LinearRegression.</p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.LinearRegressionWithSGD.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>iterations=100</em>, <em>step=1.0</em>, <em>miniBatchFraction=1.0</em>, <em>initialWeights=None</em>, <em>regParam=0.0</em>, <em>regType=None</em>, <em>intercept=False</em>, <em>validateData=True</em>, <em>convergenceTol=0.001</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LinearRegressionWithSGD.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LinearRegressionWithSGD.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a linear regression model using Stochastic Gradient
Descent (SGD). This solves the least squares regression
formulation</p>
<blockquote>
<div>f(weights) = 1/(2n) ||A weights - y||^2</div></blockquote>
<p>which is the mean squared error. Here the data matrix has n rows,
and the input RDD holds the set of rows of A, each with its
corresponding right hand side label y.
See also the documentation for the precise formulation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The training data, an RDD of LabeledPoint.</li>
<li><strong>iterations</strong> – The number of iterations.
(default: 100)</li>
<li><strong>step</strong> – The step parameter used in SGD.
(default: 1.0)</li>
<li><strong>miniBatchFraction</strong> – Fraction of data to be used for each SGD iteration.
(default: 1.0)</li>
<li><strong>initialWeights</strong> – The initial weights.
(default: None)</li>
<li><strong>regParam</strong> – The regularizer parameter.
(default: 0.0)</li>
<li><strong>regType</strong><p>The type of regularizer used for training our model.
Supported values:</p>
<blockquote>
<div><ul>
<li>&quot;l1&quot; for using L1 regularization</li>
<li>&quot;l2&quot; for using L2 regularization</li>
<li>None for no regularization (default)</li>
</ul>
</div></blockquote>
</li>
<li><strong>intercept</strong> – Boolean parameter which indicates the use or not of the
augmented representation for training data (i.e., whether bias
features are activated or not).
(default: False)</li>
<li><strong>validateData</strong> – Boolean parameter which indicates if the algorithm should
validate data before training.
(default: True)</li>
<li><strong>convergenceTol</strong> – A condition which decides iteration termination.
(default: 0.001)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.RidgeRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">RidgeRegressionModel</code><span class="sig-paren">(</span><em>weights</em>, <em>intercept</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#RidgeRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>A linear regression model derived from a least-squares fit with
an l_2 penalty term.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">3.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">RidgeRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">1.0</span><span class="p">]]))</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">RidgeRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">3.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">3.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">RidgeRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">step</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">regParam</span><span class="o">=</span><span class="mf">0.01</span><span class="p">,</span> <span class="n">miniBatchFraction</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]),</span> <span class="n">intercept</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">validateData</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.regression.RidgeRegressionModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Intercept computed for this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.RidgeRegressionModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#RidgeRegressionModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a RidgeRegressionModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.RidgeRegressionModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict the value of the dependent variable given a vector or
an RDD of vectors containing values for the independent variables.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.RidgeRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#RidgeRegressionModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save a RidgeRegressionModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.regression.RidgeRegressionModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weights computed for every feature.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.RidgeRegressionWithSGD">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">RidgeRegressionWithSGD</code><a class="reference internal" href="_modules/pyspark/mllib/regression.html#RidgeRegressionWithSGD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionWithSGD" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 0.0.
Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for
LinearRegression.</p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.RidgeRegressionWithSGD.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>iterations=100</em>, <em>step=1.0</em>, <em>regParam=0.01</em>, <em>miniBatchFraction=1.0</em>, <em>initialWeights=None</em>, <em>intercept=False</em>, <em>validateData=True</em>, <em>convergenceTol=0.001</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#RidgeRegressionWithSGD.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.RidgeRegressionWithSGD.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a regression model with L2-regularization using Stochastic
Gradient Descent. This solves the l2-regularized least squares
regression formulation</p>
<blockquote>
<div>f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2</div></blockquote>
<p>Here the data matrix has n rows, and the input RDD holds the set
of rows of A, each with its corresponding right hand side label y.
See also the documentation for the precise formulation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The training data, an RDD of LabeledPoint.</li>
<li><strong>iterations</strong> – The number of iterations.
(default: 100)</li>
<li><strong>step</strong> – The step parameter used in SGD.
(default: 1.0)</li>
<li><strong>regParam</strong> – The regularizer parameter.
(default: 0.01)</li>
<li><strong>miniBatchFraction</strong> – Fraction of data to be used for each SGD iteration.
(default: 1.0)</li>
<li><strong>initialWeights</strong> – The initial weights.
(default: None)</li>
<li><strong>intercept</strong> – Boolean parameter which indicates the use or not of the
augmented representation for training data (i.e., whether bias
features are activated or not).
(default: False)</li>
<li><strong>validateData</strong> – Boolean parameter which indicates if the algorithm should
validate data before training.
(default: True)</li>
<li><strong>convergenceTol</strong> – A condition which decides iteration termination.
(default: 0.001)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.LassoModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">LassoModel</code><span class="sig-paren">(</span><em>weights</em>, <em>intercept</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LassoModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LassoModel" title="Permalink to this definition"></a></dt>
<dd><p>A linear regression model derived from a least-squares fit with
an l_1 penalty term.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">3.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LassoWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">1.0</span><span class="p">]]))</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">LassoModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">3.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">3.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lrm</span> <span class="o">=</span> <span class="n">LassoWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">step</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">regParam</span><span class="o">=</span><span class="mf">0.01</span><span class="p">,</span> <span class="n">miniBatchFraction</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">initialWeights</span><span class="o">=</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]),</span> <span class="n">intercept</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">validateData</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span> <span class="o">-</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">lrm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.5</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.regression.LassoModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.mllib.regression.LassoModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Intercept computed for this model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.LassoModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LassoModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LassoModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a LassoModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.LassoModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.regression.LassoModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict the value of the dependent variable given a vector or
an RDD of vectors containing values for the independent variables.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.LassoModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LassoModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LassoModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save a LassoModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.mllib.regression.LassoModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.mllib.regression.LassoModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weights computed for every feature.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.LassoWithSGD">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">LassoWithSGD</code><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LassoWithSGD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LassoWithSGD" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 1.0.
Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression.</p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.LassoWithSGD.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>iterations=100</em>, <em>step=1.0</em>, <em>regParam=0.01</em>, <em>miniBatchFraction=1.0</em>, <em>initialWeights=None</em>, <em>intercept=False</em>, <em>validateData=True</em>, <em>convergenceTol=0.001</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#LassoWithSGD.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.LassoWithSGD.train" title="Permalink to this definition"></a></dt>
<dd><p>Train a regression model with L1-regularization using Stochastic
Gradient Descent. This solves the L1-regularized least squares
regression formulation:</p>
<blockquote>
<div>f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1</div></blockquote>
<p>Here the data matrix has n rows, and the input RDD holds the set
of rows of A, each with its corresponding right hand side label y.
See also the documentation for the precise formulation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – The training data, an RDD of LabeledPoint.</li>
<li><strong>iterations</strong> – The number of iterations.
(default: 100)</li>
<li><strong>step</strong> – The step parameter used in SGD.
(default: 1.0)</li>
<li><strong>regParam</strong> – The regularizer parameter.
(default: 0.01)</li>
<li><strong>miniBatchFraction</strong> – Fraction of data to be used for each SGD iteration.
(default: 1.0)</li>
<li><strong>initialWeights</strong> – The initial weights.
(default: None)</li>
<li><strong>intercept</strong> – Boolean parameter which indicates whether to use the
augmented representation for training data (i.e., whether bias
features are activated or not).
(default: False)</li>
<li><strong>validateData</strong> – Boolean parameter which indicates if the algorithm should
validate data before training.
(default: True)</li>
<li><strong>convergenceTol</strong> – A condition which decides iteration termination.
(default: 0.001)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.9.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.IsotonicRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">IsotonicRegressionModel</code><span class="sig-paren">(</span><em>boundaries</em>, <em>predictions</em>, <em>isotonic</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#IsotonicRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.IsotonicRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Regression model for isotonic regression.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>boundaries</strong> – Array of boundaries for which predictions are known. Boundaries
must be sorted in increasing order.</li>
<li><strong>predictions</strong> – Array of predictions associated with the boundaries at the same
index. These are the results of isotonic regression and are therefore monotone.</li>
<li><strong>isotonic</strong> – Indicates whether this is isotonic or antitonic.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">17</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">16</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">1</span><span class="p">)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">irm</span> <span class="o">=</span> <span class="n">IsotonicRegression</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">irm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">irm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
<span class="go">16.5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">irm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">]))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[2.0, 16.5]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">tempfile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">irm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">IsotonicRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
<span class="go">16.5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">shutil</span> <span class="k">import</span> <span class="n">rmtree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.IsotonicRegressionModel.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#IsotonicRegressionModel.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.IsotonicRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load an IsotonicRegressionModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.IsotonicRegressionModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#IsotonicRegressionModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.IsotonicRegressionModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict labels for provided features
using a piecewise linear function.
1) If x exactly matches a boundary then the associated prediction
is returned. In case there are multiple predictions with the
same boundary then one of them is returned. Which one is
undefined (same as java.util.Arrays.binarySearch).
2) If x is lower or higher than all boundaries then the first or
last prediction is returned respectively. In case there are
multiple predictions with the same boundary then the lowest
or highest is returned respectively.
3) If x falls between two values in the boundary array then
the prediction is treated as a piecewise linear function and
the interpolated value is returned. In case there are multiple
values with the same boundary then the same rules as in 2)
are used.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – Feature or RDD of Features to be labeled.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.IsotonicRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#IsotonicRegressionModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.IsotonicRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save an IsotonicRegressionModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.IsotonicRegression">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">IsotonicRegression</code><a class="reference internal" href="_modules/pyspark/mllib/regression.html#IsotonicRegression"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.IsotonicRegression" title="Permalink to this definition"></a></dt>
<dd><p>Isotonic regression.
Currently implemented using the parallelized pool adjacent violators
algorithm. Only the univariate (single feature) algorithm is supported.</p>
<p>Sequential PAV implementation based on:</p>
<blockquote>
<div>Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani.
“Nearly-isotonic regression.” Technometrics 53.1 (2011): 54-61.
Available from <a class="reference external" href="http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf">http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf</a></div></blockquote>
<p>Sequential PAV parallelization based on:</p>
<blockquote>
<div>Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset.
“An approach to parallelizing isotonic regression.”
Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147.
Available from <a class="reference external" href="http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf">http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf</a></div></blockquote>
<p>See <a class="reference external" href="http://en.wikipedia.org/wiki/Isotonic_regression">Isotonic regression (Wikipedia)</a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.regression.IsotonicRegression.train">
<em class="property">classmethod </em><code class="descname">train</code><span class="sig-paren">(</span><em>data</em>, <em>isotonic=True</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#IsotonicRegression.train"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.IsotonicRegression.train" title="Permalink to this definition"></a></dt>
<dd><p>Train an isotonic regression model on the given data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – RDD of (label, feature, weight) tuples.</li>
<li><strong>isotonic</strong> – Whether this is isotonic (which is default) or antitonic.
(default: True)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.StreamingLinearAlgorithm">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">StreamingLinearAlgorithm</code><span class="sig-paren">(</span><em>model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#StreamingLinearAlgorithm"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearAlgorithm" title="Permalink to this definition"></a></dt>
<dd><p>Base class that has to be inherited by any StreamingLinearAlgorithm.</p>
<p>Prevents reimplementation of methods predictOn and predictOnValues.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearAlgorithm.latestModel">
<code class="descname">latestModel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#StreamingLinearAlgorithm.latestModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearAlgorithm.latestModel" title="Permalink to this definition"></a></dt>
<dd><p>Returns the latest model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearAlgorithm.predictOn">
<code class="descname">predictOn</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#StreamingLinearAlgorithm.predictOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearAlgorithm.predictOn" title="Permalink to this definition"></a></dt>
<dd><p>Use the model to make predictions on batches of data from a
DStream.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">DStream containing predictions.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearAlgorithm.predictOnValues">
<code class="descname">predictOnValues</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#StreamingLinearAlgorithm.predictOnValues"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearAlgorithm.predictOnValues" title="Permalink to this definition"></a></dt>
<dd><p>Use the model to make predictions on the values of a DStream and
carry over its keys.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">DStream containing the input keys and the predictions as values.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.regression.StreamingLinearRegressionWithSGD">
<em class="property">class </em><code class="descclassname">pyspark.mllib.regression.</code><code class="descname">StreamingLinearRegressionWithSGD</code><span class="sig-paren">(</span><em>stepSize=0.1</em>, <em>numIterations=50</em>, <em>miniBatchFraction=1.0</em>, <em>convergenceTol=0.001</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#StreamingLinearRegressionWithSGD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearRegressionWithSGD" title="Permalink to this definition"></a></dt>
<dd><p>Train or predict a linear regression model on streaming data.
Training uses Stochastic Gradient Descent to update the model
based on each new batch of incoming data from a DStream
(see <cite>LinearRegressionWithSGD</cite> for model equation).</p>
<p>Each batch of data is assumed to be an RDD of LabeledPoints.
The number of data points per batch can vary, but the number
of features must be constant. An initial weight vector must
be provided.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>stepSize</strong> – Step size for each iteration of gradient descent.
(default: 0.1)</li>
<li><strong>numIterations</strong> – Number of iterations run for each batch of data.
(default: 50)</li>
<li><strong>miniBatchFraction</strong> – Fraction of each batch of data to use for updates.
(default: 1.0)</li>
<li><strong>convergenceTol</strong> – Value used to determine when to terminate iterations.
(default: 0.001)</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearRegressionWithSGD.latestModel">
<code class="descname">latestModel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearRegressionWithSGD.latestModel" title="Permalink to this definition"></a></dt>
<dd><p>Returns the latest model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearRegressionWithSGD.predictOn">
<code class="descname">predictOn</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearRegressionWithSGD.predictOn" title="Permalink to this definition"></a></dt>
<dd><p>Use the model to make predictions on batches of data from a
DStream.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">DStream containing predictions.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearRegressionWithSGD.predictOnValues">
<code class="descname">predictOnValues</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearRegressionWithSGD.predictOnValues" title="Permalink to this definition"></a></dt>
<dd><p>Use the model to make predictions on the values of a DStream and
carry over its keys.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">DStream containing the input keys and the predictions as values.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearRegressionWithSGD.setInitialWeights">
<code class="descname">setInitialWeights</code><span class="sig-paren">(</span><em>initialWeights</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#StreamingLinearRegressionWithSGD.setInitialWeights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearRegressionWithSGD.setInitialWeights" title="Permalink to this definition"></a></dt>
<dd><p>Set the initial value of weights.</p>
<p>This must be set before running trainOn and predictOn.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.regression.StreamingLinearRegressionWithSGD.trainOn">
<code class="descname">trainOn</code><span class="sig-paren">(</span><em>dstream</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/regression.html#StreamingLinearRegressionWithSGD.trainOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.regression.StreamingLinearRegressionWithSGD.trainOn" title="Permalink to this definition"></a></dt>
<dd><p>Train the model on the incoming dstream.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.stat">
<span id="pyspark-mllib-stat-module"></span><h2>pyspark.mllib.stat module<a class="headerlink" href="#module-pyspark.mllib.stat" title="Permalink to this headline"></a></h2>
<p>Python package for statistical functions in MLlib.</p>
<dl class="class">
<dt id="pyspark.mllib.stat.Statistics">
<em class="property">class </em><code class="descclassname">pyspark.mllib.stat.</code><code class="descname">Statistics</code><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#Statistics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.Statistics" title="Permalink to this definition"></a></dt>
<dd><dl class="staticmethod">
<dt id="pyspark.mllib.stat.Statistics.chiSqTest">
<em class="property">static </em><code class="descname">chiSqTest</code><span class="sig-paren">(</span><em>observed</em>, <em>expected=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#Statistics.chiSqTest"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.Statistics.chiSqTest" title="Permalink to this definition"></a></dt>
<dd><p>If <cite>observed</cite> is Vector, conduct Pearson’s chi-squared goodness
of fit test of the observed data against the expected distribution,
or against the uniform distribution (by default), with each category
having an expected frequency of <cite>1 / len(observed)</cite>.</p>
<p>If <cite>observed</cite> is matrix, conduct Pearson’s independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.</p>
<p>If <cite>observed</cite> is an RDD of LabeledPoint, conduct Pearson’s independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last"><cite>observed</cite> cannot contain negative values</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>observed</strong> – it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.</li>
<li><strong>expected</strong> – Vector containing the expected categorical counts/relative
frequencies. <cite>expected</cite> is rescaled if the <cite>expected</cite> sum
differs from the <cite>observed</cite> sum.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span><span class="p">,</span> <span class="n">Matrices</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">observed</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">5</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pearson</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">chiSqTest</span><span class="p">(</span><span class="n">observed</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">pearson</span><span class="o">.</span><span class="n">statistic</span><span class="p">)</span>
<span class="go">0.4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pearson</span><span class="o">.</span><span class="n">degreesOfFreedom</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">pearson</span><span class="o">.</span><span class="n">pValue</span><span class="p">,</span> <span class="mi">4</span><span class="p">))</span>
<span class="go">0.8187</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pearson</span><span class="o">.</span><span class="n">method</span>
<span class="go">&#39;pearson&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pearson</span><span class="o">.</span><span class="n">nullHypothesis</span>
<span class="go">&#39;observed follows the same distribution as expected.&#39;</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">observed</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">21</span><span class="p">,</span> <span class="mi">38</span><span class="p">,</span> <span class="mi">43</span><span class="p">,</span> <span class="mi">80</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expected</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">20</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pearson</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">chiSqTest</span><span class="p">(</span><span class="n">observed</span><span class="p">,</span> <span class="n">expected</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">pearson</span><span class="o">.</span><span class="n">pValue</span><span class="p">,</span> <span class="mi">4</span><span class="p">))</span>
<span class="go">0.0027</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span><span class="mf">40.0</span><span class="p">,</span> <span class="mf">24.0</span><span class="p">,</span> <span class="mf">29.0</span><span class="p">,</span> <span class="mf">56.0</span><span class="p">,</span> <span class="mf">32.0</span><span class="p">,</span> <span class="mf">42.0</span><span class="p">,</span> <span class="mf">31.0</span><span class="p">,</span> <span class="mf">10.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">30.0</span><span class="p">,</span> <span class="mf">15.0</span><span class="p">,</span> <span class="mf">12.0</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">chi</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">chiSqTest</span><span class="p">(</span><span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="n">data</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">chi</span><span class="o">.</span><span class="n">statistic</span><span class="p">,</span> <span class="mi">4</span><span class="p">))</span>
<span class="go">21.9958</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span><span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">10.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.5</span><span class="p">,</span> <span class="mf">20.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.5</span><span class="p">,</span> <span class="mf">30.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.5</span><span class="p">,</span> <span class="mf">30.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.5</span><span class="p">,</span> <span class="mf">40.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.5</span><span class="p">,</span> <span class="mf">40.0</span><span class="p">])),]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">chi</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">chiSqTest</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">chi</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">statistic</span><span class="p">)</span>
<span class="go">0.75</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">chi</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">statistic</span><span class="p">)</span>
<span class="go">1.5</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.stat.Statistics.colStats">
<em class="property">static </em><code class="descname">colStats</code><span class="sig-paren">(</span><em>rdd</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#Statistics.colStats"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.Statistics.colStats" title="Permalink to this definition"></a></dt>
<dd><p>Computes column-wise summary statistics for the input RDD[Vector].</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rdd</strong> – an RDD[Vector] for which column-wise summary statistics
are to be computed.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><a class="reference internal" href="#pyspark.mllib.stat.MultivariateStatisticalSummary" title="pyspark.mllib.stat.MultivariateStatisticalSummary"><code class="xref py py-class docutils literal"><span class="pre">MultivariateStatisticalSummary</span></code></a> object containing
column-wise summary statistics.</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="mi">2</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">6</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cStats</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">colStats</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cStats</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="go">array([ 4., 4., 0., 3.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cStats</span><span class="o">.</span><span class="n">variance</span><span class="p">()</span>
<span class="go">array([ 4., 13., 0., 25.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cStats</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cStats</span><span class="o">.</span><span class="n">numNonzeros</span><span class="p">()</span>
<span class="go">array([ 3., 2., 0., 3.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cStats</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
<span class="go">array([ 6., 7., 0., 8.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cStats</span><span class="o">.</span><span class="n">min</span><span class="p">()</span>
<span class="go">array([ 2., 0., 0., -2.])</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.stat.Statistics.corr">
<em class="property">static </em><code class="descname">corr</code><span class="sig-paren">(</span><em>x</em>, <em>y=None</em>, <em>method=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#Statistics.corr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.Statistics.corr" title="Permalink to this definition"></a></dt>
<dd><p>Compute the correlation (matrix) for the input RDD(s) using the
specified method.
Methods currently supported: <cite>pearson (default), spearman</cite>.</p>
<p>If a single RDD of Vectors is passed in, a correlation matrix
comparing the columns in the input RDD is returned. Use <cite>method=</cite>
to specify the method to be used for single RDD input.
If two RDDs of floats are passed in, a single float is returned.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>x</strong> – an RDD of vector for which the correlation matrix is to be computed,
or an RDD of float of the same cardinality as y when y is specified.</li>
<li><strong>y</strong> – an RDD of float of the same cardinality as x.</li>
<li><strong>method</strong> – String specifying the method to use for computing correlation.
Supported: <cite>pearson</cite> (default), <cite>spearman</cite></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">Correlation matrix comparing columns in x, or a single float when both x and y are passed in.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="p">],</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">y</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">],</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">zeros</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">],</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span> <span class="o">-</span> <span class="mf">0.6546537</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">1e-7</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span> <span class="o">==</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="s2">&quot;pearson&quot;</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="s2">&quot;spearman&quot;</span><span class="p">)</span>
<span class="go">0.5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">math</span> <span class="k">import</span> <span class="n">isnan</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">isnan</span><span class="p">(</span><span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">zeros</span><span class="p">))</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="mi">2</span><span class="p">]),</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">6</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">]),</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">9</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pearsonCorr</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">pearsonCorr</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">&#39;nan&#39;</span><span class="p">,</span> <span class="s1">&#39;NaN&#39;</span><span class="p">))</span>
<span class="go">[[ 1. 0.05564149 NaN 0.40047142]</span>
<span class="go"> [ 0.05564149 1. NaN 0.91359586]</span>
<span class="go"> [ NaN NaN 1. NaN]</span>
<span class="go"> [ 0.40047142 0.91359586 NaN 1. ]]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spearmanCorr</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s2">&quot;spearman&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">spearmanCorr</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">&#39;nan&#39;</span><span class="p">,</span> <span class="s1">&#39;NaN&#39;</span><span class="p">))</span>
<span class="go">[[ 1. 0.10540926 NaN 0.4 ]</span>
<span class="go"> [ 0.10540926 1. NaN 0.9486833 ]</span>
<span class="go"> [ NaN NaN 1. NaN]</span>
<span class="go"> [ 0.4 0.9486833 NaN 1. ]]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">try</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">Statistics</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="s2">&quot;spearman&quot;</span><span class="p">)</span>
<span class="gp">... </span> <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Method name as second argument without &#39;method=&#39; shouldn&#39;t be allowed.&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">pass</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.stat.Statistics.kolmogorovSmirnovTest">
<em class="property">static </em><code class="descname">kolmogorovSmirnovTest</code><span class="sig-paren">(</span><em>data</em>, <em>distName='norm'</em>, <em>*params</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#Statistics.kolmogorovSmirnovTest"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.Statistics.kolmogorovSmirnovTest" title="Permalink to this definition"></a></dt>
<dd><p>Performs the Kolmogorov-Smirnov (KS) test for data sampled from
a continuous distribution. It tests the null hypothesis that
the data is generated from a particular distribution.</p>
<p>The given data is sorted and the Empirical Cumulative
Distribution Function (ECDF) is calculated
which for a given point is the number of points having a CDF
value less than it divided by the total number of points.</p>
<p>Since the data is sorted, this is a step function
that rises by (1 / length of data) for every ordered point.</p>
<p>The KS statistic gives us the maximum distance between the
ECDF and the CDF. Intuitively if this statistic is large, the
probability that the null hypothesis is true becomes small.
For specific details of the implementation, please have a look
at the Scala documentation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> – RDD, samples from the data</li>
<li><strong>distName</strong> – string, name of the theoretical distribution
to compare the data against. Currently only “norm”
(Normal distribution) is supported.</li>
<li><strong>params</strong> – additional values which need to be provided for
a certain distribution.
If not provided, the default values are used.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">KolmogorovSmirnovTestResult object containing the test
statistic, degrees of freedom, p-value,
the method used, and the null hypothesis.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">kstest</span> <span class="o">=</span> <span class="n">Statistics</span><span class="o">.</span><span class="n">kolmogorovSmirnovTest</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ksmodel</span> <span class="o">=</span> <span class="n">kstest</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="s2">&quot;norm&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">ksmodel</span><span class="o">.</span><span class="n">pValue</span><span class="p">,</span> <span class="mi">3</span><span class="p">))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">ksmodel</span><span class="o">.</span><span class="n">statistic</span><span class="p">,</span> <span class="mi">3</span><span class="p">))</span>
<span class="go">0.175</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ksmodel</span><span class="o">.</span><span class="n">nullHypothesis</span>
<span class="go">&#39;Sample follows theoretical distribution&#39;</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ksmodel</span> <span class="o">=</span> <span class="n">kstest</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="s2">&quot;norm&quot;</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">ksmodel</span><span class="o">.</span><span class="n">pValue</span><span class="p">,</span> <span class="mi">3</span><span class="p">))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">ksmodel</span><span class="o">.</span><span class="n">statistic</span><span class="p">,</span> <span class="mi">3</span><span class="p">))</span>
<span class="go">0.175</span>
</pre></div>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary">
<em class="property">class </em><code class="descclassname">pyspark.mllib.stat.</code><code class="descname">MultivariateStatisticalSummary</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary" title="Permalink to this definition"></a></dt>
<dd><p>Trait for multivariate statistical summary of a data matrix.</p>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.count">
<code class="descname">count</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.count"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.count" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.max">
<code class="descname">max</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.max"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.max" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.mean">
<code class="descname">mean</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.mean"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.mean" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.min">
<code class="descname">min</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.min"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.min" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.normL1">
<code class="descname">normL1</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.normL1"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.normL1" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.normL2">
<code class="descname">normL2</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.normL2"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.normL2" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.numNonzeros">
<code class="descname">numNonzeros</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.numNonzeros"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.numNonzeros" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.MultivariateStatisticalSummary.variance">
<code class="descname">variance</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/_statistics.html#MultivariateStatisticalSummary.variance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateStatisticalSummary.variance" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.stat.ChiSqTestResult">
<em class="property">class </em><code class="descclassname">pyspark.mllib.stat.</code><code class="descname">ChiSqTestResult</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/test.html#ChiSqTestResult"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.ChiSqTestResult" title="Permalink to this definition"></a></dt>
<dd><p>Contains test results for the chi-squared hypothesis test.</p>
<dl class="attribute">
<dt id="pyspark.mllib.stat.ChiSqTestResult.method">
<code class="descname">method</code><a class="headerlink" href="#pyspark.mllib.stat.ChiSqTestResult.method" title="Permalink to this definition"></a></dt>
<dd><p>Name of the test method</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.stat.MultivariateGaussian">
<em class="property">class </em><code class="descclassname">pyspark.mllib.stat.</code><code class="descname">MultivariateGaussian</code><a class="reference internal" href="_modules/pyspark/mllib/stat/distribution.html#MultivariateGaussian"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.MultivariateGaussian" title="Permalink to this definition"></a></dt>
<dd><p>Represents a (mu, sigma) tuple</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">m</span> <span class="o">=</span> <span class="n">MultivariateGaussian</span><span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">11</span><span class="p">,</span><span class="mi">12</span><span class="p">]),</span><span class="n">DenseMatrix</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">m</span><span class="o">.</span><span class="n">mu</span><span class="p">,</span> <span class="n">m</span><span class="o">.</span><span class="n">sigma</span><span class="o">.</span><span class="n">toArray</span><span class="p">())</span>
<span class="go">(DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">m</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">m</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="go">(DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]]))</span>
</pre></div>
</div>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.stat.KernelDensity">
<em class="property">class </em><code class="descclassname">pyspark.mllib.stat.</code><code class="descname">KernelDensity</code><a class="reference internal" href="_modules/pyspark/mllib/stat/KernelDensity.html#KernelDensity"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.KernelDensity" title="Permalink to this definition"></a></dt>
<dd><p>Estimate probability density at required points given an RDD of samples
from the population.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">kd</span> <span class="o">=</span> <span class="n">KernelDensity</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sample</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">kd</span><span class="o">.</span><span class="n">setSample</span><span class="p">(</span><span class="n">sample</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">kd</span><span class="o">.</span><span class="n">estimate</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="go">array([ 0.12938758, 0.12938758])</span>
</pre></div>
</div>
<dl class="method">
<dt id="pyspark.mllib.stat.KernelDensity.estimate">
<code class="descname">estimate</code><span class="sig-paren">(</span><em>points</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/KernelDensity.html#KernelDensity.estimate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.KernelDensity.estimate" title="Permalink to this definition"></a></dt>
<dd><p>Estimate the probability density at points</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.KernelDensity.setBandwidth">
<code class="descname">setBandwidth</code><span class="sig-paren">(</span><em>bandwidth</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/KernelDensity.html#KernelDensity.setBandwidth"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.KernelDensity.setBandwidth" title="Permalink to this definition"></a></dt>
<dd><p>Set bandwidth of each sample. Defaults to 1.0.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.stat.KernelDensity.setSample">
<code class="descname">setSample</code><span class="sig-paren">(</span><em>sample</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/stat/KernelDensity.html#KernelDensity.setSample"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.stat.KernelDensity.setSample" title="Permalink to this definition"></a></dt>
<dd><p>Set sample points from the population. Should be an RDD.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.tree">
<span id="pyspark-mllib-tree-module"></span><h2>pyspark.mllib.tree module<a class="headerlink" href="#module-pyspark.mllib.tree" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.tree.DecisionTreeModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.tree.</code><code class="descname">DecisionTreeModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTreeModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel" title="Permalink to this definition"></a></dt>
<dd><p>A decision tree model for classification or regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.tree.DecisionTreeModel.call">
<code class="descname">call</code><span class="sig-paren">(</span><em>name</em>, <em>*a</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel.call" title="Permalink to this definition"></a></dt>
<dd><p>Call method of java_model</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.DecisionTreeModel.depth">
<code class="descname">depth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTreeModel.depth"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel.depth" title="Permalink to this definition"></a></dt>
<dd><p>Get depth of tree (e.g. depth 0 means 1 leaf node, depth 1
means 1 internal node + 2 leaf nodes).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.DecisionTreeModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.DecisionTreeModel.numNodes">
<code class="descname">numNodes</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTreeModel.numNodes"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel.numNodes" title="Permalink to this definition"></a></dt>
<dd><p>Get number of nodes in tree, including leaf nodes.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.DecisionTreeModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTreeModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict the label of one or more examples.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">In Python, predict cannot currently be used within an RDD
transformation or action.
Call predict directly on the RDD instead.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>x</strong> – Data point (feature vector), or an RDD of data points (feature
vectors).</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.DecisionTreeModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.DecisionTreeModel.toDebugString">
<code class="descname">toDebugString</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTreeModel.toDebugString"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTreeModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Return a string representation of the full model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.tree.DecisionTree">
<em class="property">class </em><code class="descclassname">pyspark.mllib.tree.</code><code class="descname">DecisionTree</code><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTree"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTree" title="Permalink to this definition"></a></dt>
<dd><p>Learning algorithm for a decision tree model for classification or
regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.tree.DecisionTree.trainClassifier">
<em class="property">classmethod </em><code class="descname">trainClassifier</code><span class="sig-paren">(</span><em>data</em>, <em>numClasses</em>, <em>categoricalFeaturesInfo</em>, <em>impurity='gini'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTree.trainClassifier"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTree.trainClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Train a decision tree model for classification.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> – Training data: RDD of LabeledPoint. Labels should take values
{0, 1, …, numClasses-1}.</li>
<li><strong>numClasses</strong> – Number of classes for classification.</li>
<li><strong>categoricalFeaturesInfo</strong> – Map storing arity of categorical features. An entry (n -&gt; k)
indicates that feature n is categorical with k categories
indexed from 0: {0, 1, …, k-1}.</li>
<li><strong>impurity</strong> – Criterion used for information gain calculation.
Supported values: “gini” or “entropy”.
(default: “gini”)</li>
<li><strong>maxDepth</strong> – Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
means 1 internal node + 2 leaf nodes).
(default: 5)</li>
<li><strong>maxBins</strong> – Number of bins used for finding splits at each node.
(default: 32)</li>
<li><strong>minInstancesPerNode</strong> – Minimum number of instances required at child nodes to create
the parent split.
(default: 1)</li>
<li><strong>minInfoGain</strong> – Minimum info gain required to create a split.
(default: 0.0)</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">DecisionTreeModel.</p>
</td>
</tr>
</tbody>
</table>
<p>Example usage:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">numpy</span> <span class="k">import</span> <span class="n">array</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.tree</span> <span class="k">import</span> <span class="n">DecisionTree</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">DecisionTree</span><span class="o">.</span><span class="n">trainClassifier</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="mi">2</span><span class="p">,</span> <span class="p">{})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="go">DecisionTreeModel classifier of depth 1 with 3 nodes</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">toDebugString</span><span class="p">())</span>
<span class="go">DecisionTreeModel classifier of depth 1 with 3 nodes</span>
<span class="go"> If (feature 0 &lt;= 0.0)</span>
<span class="go"> Predict: 0.0</span>
<span class="go"> Else (feature 0 &gt; 0.0)</span>
<span class="go"> Predict: 1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]))</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">1.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1.0, 0.0]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.tree.DecisionTree.trainRegressor">
<em class="property">classmethod </em><code class="descname">trainRegressor</code><span class="sig-paren">(</span><em>data</em>, <em>categoricalFeaturesInfo</em>, <em>impurity='variance'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#DecisionTree.trainRegressor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.DecisionTree.trainRegressor" title="Permalink to this definition"></a></dt>
<dd><p>Train a decision tree model for regression.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> – Training data: RDD of LabeledPoint. Labels are real numbers.</li>
<li><strong>categoricalFeaturesInfo</strong> – Map storing arity of categorical features. An entry (n -&gt; k)
indicates that feature n is categorical with k categories
indexed from 0: {0, 1, …, k-1}.</li>
<li><strong>impurity</strong> – Criterion used for information gain calculation.
The only supported value for regression is “variance”.
(default: “variance”)</li>
<li><strong>maxDepth</strong> – Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
means 1 internal node + 2 leaf nodes).
(default: 5)</li>
<li><strong>maxBins</strong> – Number of bins used for finding splits at each node.
(default: 32)</li>
<li><strong>minInstancesPerNode</strong> – Minimum number of instances required at child nodes to create
the parent split.
(default: 1)</li>
<li><strong>minInfoGain</strong> – Minimum info gain required to create a split.
(default: 0.0)</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">DecisionTreeModel.</p>
</td>
</tr>
</tbody>
</table>
<p>Example usage:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.tree</span> <span class="k">import</span> <span class="n">DecisionTree</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">SparseVector</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sparse_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">DecisionTree</span><span class="o">.</span><span class="n">trainRegressor</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">),</span> <span class="p">{})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">0.0</span><span class="p">}))</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1.0, 0.0]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.tree.RandomForestModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.tree.</code><code class="descname">RandomForestModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#RandomForestModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel" title="Permalink to this definition"></a></dt>
<dd><p>Represents a random forest model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.tree.RandomForestModel.call">
<code class="descname">call</code><span class="sig-paren">(</span><em>name</em>, <em>*a</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel.call" title="Permalink to this definition"></a></dt>
<dd><p>Call method of java_model</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.RandomForestModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.RandomForestModel.numTrees">
<code class="descname">numTrees</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel.numTrees" title="Permalink to this definition"></a></dt>
<dd><p>Get number of trees in ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.RandomForestModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict values for a single data point or an RDD of points using
the trained model.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">In Python, predict cannot currently be used within an RDD
transformation or action.
Call predict directly on the RDD instead.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.RandomForestModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.RandomForestModel.toDebugString">
<code class="descname">toDebugString</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full model, returned as a human-readable debug string.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.RandomForestModel.totalNumNodes">
<code class="descname">totalNumNodes</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.RandomForestModel.totalNumNodes" title="Permalink to this definition"></a></dt>
<dd><p>Get total number of nodes, summed over all trees in the ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.tree.RandomForest">
<em class="property">class </em><code class="descclassname">pyspark.mllib.tree.</code><code class="descname">RandomForest</code><a class="reference internal" href="_modules/pyspark/mllib/tree.html#RandomForest"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.RandomForest" title="Permalink to this definition"></a></dt>
<dd><p>Learning algorithm for a random forest model for classification or
regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.mllib.tree.RandomForest.supportedFeatureSubsetStrategies">
<code class="descname">supportedFeatureSubsetStrategies</code><em class="property"> = ('auto', 'all', 'sqrt', 'log2', 'onethird')</em><a class="headerlink" href="#pyspark.mllib.tree.RandomForest.supportedFeatureSubsetStrategies" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.tree.RandomForest.trainClassifier">
<em class="property">classmethod </em><code class="descname">trainClassifier</code><span class="sig-paren">(</span><em>data</em>, <em>numClasses</em>, <em>categoricalFeaturesInfo</em>, <em>numTrees</em>, <em>featureSubsetStrategy='auto'</em>, <em>impurity='gini'</em>, <em>maxDepth=4</em>, <em>maxBins=32</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#RandomForest.trainClassifier"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.RandomForest.trainClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Train a random forest model for binary or multiclass
classification.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> – Training dataset: RDD of LabeledPoint. Labels should take values
{0, 1, …, numClasses-1}.</li>
<li><strong>numClasses</strong> – Number of classes for classification.</li>
<li><strong>categoricalFeaturesInfo</strong> – Map storing arity of categorical features. An entry (n -&gt; k)
indicates that feature n is categorical with k categories
indexed from 0: {0, 1, …, k-1}.</li>
<li><strong>numTrees</strong> – Number of trees in the random forest.</li>
<li><strong>featureSubsetStrategy</strong> – Number of features to consider for splits at each node.
Supported values: “auto”, “all”, “sqrt”, “log2”, “onethird”.
If “auto” is set, this parameter is set based on numTrees:
if numTrees == 1, set to “all”;
if numTrees &gt; 1 (forest) set to “sqrt”.
(default: “auto”)</li>
<li><strong>impurity</strong> – Criterion used for information gain calculation.
Supported values: “gini” or “entropy”.
(default: “gini”)</li>
<li><strong>maxDepth</strong> – Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
means 1 internal node + 2 leaf nodes).
(default: 4)</li>
<li><strong>maxBins</strong> – Maximum number of bins used for splitting features.
(default: 32)</li>
<li><strong>seed</strong> – Random seed for bootstrapping and choosing feature subsets.
Set as None to generate seed based on system time.
(default: None)</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RandomForestModel that can be used for prediction.</p>
</td>
</tr>
</tbody>
</table>
<p>Example usage:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.tree</span> <span class="k">import</span> <span class="n">RandomForest</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">RandomForest</span><span class="o">.</span><span class="n">trainClassifier</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="mi">2</span><span class="p">,</span> <span class="p">{},</span> <span class="mi">3</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numTrees</span><span class="p">()</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">totalNumNodes</span><span class="p">()</span>
<span class="go">7</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="go">TreeEnsembleModel classifier with 3 trees</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">toDebugString</span><span class="p">())</span>
<span class="go">TreeEnsembleModel classifier with 3 trees</span>
<span class="go"> Tree 0:</span>
<span class="go"> Predict: 1.0</span>
<span class="go"> Tree 1:</span>
<span class="go"> If (feature 0 &lt;= 1.0)</span>
<span class="go"> Predict: 0.0</span>
<span class="go"> Else (feature 0 &gt; 1.0)</span>
<span class="go"> Predict: 1.0</span>
<span class="go"> Tree 2:</span>
<span class="go"> If (feature 0 &lt;= 1.0)</span>
<span class="go"> Predict: 0.0</span>
<span class="go"> Else (feature 0 &gt; 1.0)</span>
<span class="go"> Predict: 1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">2.0</span><span class="p">])</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">])</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">3.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1.0, 0.0]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.tree.RandomForest.trainRegressor">
<em class="property">classmethod </em><code class="descname">trainRegressor</code><span class="sig-paren">(</span><em>data</em>, <em>categoricalFeaturesInfo</em>, <em>numTrees</em>, <em>featureSubsetStrategy='auto'</em>, <em>impurity='variance'</em>, <em>maxDepth=4</em>, <em>maxBins=32</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#RandomForest.trainRegressor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.RandomForest.trainRegressor" title="Permalink to this definition"></a></dt>
<dd><p>Train a random forest model for regression.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> – Training dataset: RDD of LabeledPoint. Labels are real numbers.</li>
<li><strong>categoricalFeaturesInfo</strong> – Map storing arity of categorical features. An entry (n -&gt; k)
indicates that feature n is categorical with k categories
indexed from 0: {0, 1, …, k-1}.</li>
<li><strong>numTrees</strong> – Number of trees in the random forest.</li>
<li><strong>featureSubsetStrategy</strong> – Number of features to consider for splits at each node.
Supported values: “auto”, “all”, “sqrt”, “log2”, “onethird”.
If “auto” is set, this parameter is set based on numTrees:
if numTrees == 1, set to “all”;
if numTrees &gt; 1 (forest) set to “onethird” for regression.
(default: “auto”)</li>
<li><strong>impurity</strong> – Criterion used for information gain calculation.
The only supported value for regression is “variance”.
(default: “variance”)</li>
<li><strong>maxDepth</strong> – Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
means 1 internal node + 2 leaf nodes).
(default: 4)</li>
<li><strong>maxBins</strong> – Maximum number of bins used for splitting features.
(default: 32)</li>
<li><strong>seed</strong> – Random seed for bootstrapping and choosing feature subsets.
Set as None to generate seed based on system time.
(default: None)</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">RandomForestModel that can be used for prediction.</p>
</td>
</tr>
</tbody>
</table>
<p>Example usage:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.tree</span> <span class="k">import</span> <span class="n">RandomForest</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">SparseVector</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sparse_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">RandomForest</span><span class="o">.</span><span class="n">trainRegressor</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">),</span> <span class="p">{},</span> <span class="mi">2</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numTrees</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">totalNumNodes</span><span class="p">()</span>
<span class="go">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">0.5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1.0, 0.5]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel">
<em class="property">class </em><code class="descclassname">pyspark.mllib.tree.</code><code class="descname">GradientBoostedTreesModel</code><span class="sig-paren">(</span><em>java_model</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#GradientBoostedTreesModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel" title="Permalink to this definition"></a></dt>
<dd><p>Represents a gradient-boosted tree model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel.call">
<code class="descname">call</code><span class="sig-paren">(</span><em>name</em>, <em>*a</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel.call" title="Permalink to this definition"></a></dt>
<dd><p>Call method of java_model</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel.numTrees">
<code class="descname">numTrees</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel.numTrees" title="Permalink to this definition"></a></dt>
<dd><p>Get number of trees in ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>x</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predict values for a single data point or an RDD of points using
the trained model.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">In Python, predict cannot currently be used within an RDD
transformation or action.
Call predict directly on the RDD instead.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel.toDebugString">
<code class="descname">toDebugString</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full model, returned as a human-readable debug string.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.mllib.tree.GradientBoostedTreesModel.totalNumNodes">
<code class="descname">totalNumNodes</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTreesModel.totalNumNodes" title="Permalink to this definition"></a></dt>
<dd><p>Get total number of nodes, summed over all trees in the ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.tree.GradientBoostedTrees">
<em class="property">class </em><code class="descclassname">pyspark.mllib.tree.</code><code class="descname">GradientBoostedTrees</code><a class="reference internal" href="_modules/pyspark/mllib/tree.html#GradientBoostedTrees"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTrees" title="Permalink to this definition"></a></dt>
<dd><p>Learning algorithm for a gradient boosted trees model for
classification or regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.tree.GradientBoostedTrees.trainClassifier">
<em class="property">classmethod </em><code class="descname">trainClassifier</code><span class="sig-paren">(</span><em>data</em>, <em>categoricalFeaturesInfo</em>, <em>loss='logLoss'</em>, <em>numIterations=100</em>, <em>learningRate=0.1</em>, <em>maxDepth=3</em>, <em>maxBins=32</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#GradientBoostedTrees.trainClassifier"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTrees.trainClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Train a gradient-boosted trees model for classification.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> – Training dataset: RDD of LabeledPoint. Labels should take values
{0, 1}.</li>
<li><strong>categoricalFeaturesInfo</strong> – Map storing arity of categorical features. An entry (n -&gt; k)
indicates that feature n is categorical with k categories
indexed from 0: {0, 1, …, k-1}.</li>
<li><strong>loss</strong> – Loss function used for minimization during gradient boosting.
Supported values: “logLoss”, “leastSquaresError”,
“leastAbsoluteError”.
(default: “logLoss”)</li>
<li><strong>numIterations</strong> – Number of iterations of boosting.
(default: 100)</li>
<li><strong>learningRate</strong> – Learning rate for shrinking the contribution of each estimator.
The learning rate should be in the interval (0, 1].
(default: 0.1)</li>
<li><strong>maxDepth</strong> – Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
means 1 internal node + 2 leaf nodes).
(default: 3)</li>
<li><strong>maxBins</strong> – Maximum number of bins used for splitting features. DecisionTree
requires maxBins &gt;= max categories.
(default: 32)</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">GradientBoostedTreesModel that can be used for prediction.</p>
</td>
</tr>
</tbody>
</table>
<p>Example usage:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.tree</span> <span class="k">import</span> <span class="n">GradientBoostedTrees</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">])</span>
<span class="gp">... </span><span class="p">]</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">GradientBoostedTrees</span><span class="o">.</span><span class="n">trainClassifier</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="p">{},</span> <span class="n">numIterations</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numTrees</span><span class="p">()</span>
<span class="go">10</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">totalNumNodes</span><span class="p">()</span>
<span class="go">30</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">model</span><span class="p">)</span> <span class="c1"># it already has newline</span>
<span class="go">TreeEnsembleModel classifier with 10 trees</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">2.0</span><span class="p">])</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">])</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">2.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1.0, 0.0]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.mllib.tree.GradientBoostedTrees.trainRegressor">
<em class="property">classmethod </em><code class="descname">trainRegressor</code><span class="sig-paren">(</span><em>data</em>, <em>categoricalFeaturesInfo</em>, <em>loss='leastSquaresError'</em>, <em>numIterations=100</em>, <em>learningRate=0.1</em>, <em>maxDepth=3</em>, <em>maxBins=32</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/tree.html#GradientBoostedTrees.trainRegressor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.tree.GradientBoostedTrees.trainRegressor" title="Permalink to this definition"></a></dt>
<dd><p>Train a gradient-boosted trees model for regression.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> – Training dataset: RDD of LabeledPoint. Labels are real numbers.</li>
<li><strong>categoricalFeaturesInfo</strong> – Map storing arity of categorical features. An entry (n -&gt; k)
indicates that feature n is categorical with k categories
indexed from 0: {0, 1, …, k-1}.</li>
<li><strong>loss</strong> – Loss function used for minimization during gradient boosting.
Supported values: “logLoss”, “leastSquaresError”,
“leastAbsoluteError”.
(default: “leastSquaresError”)</li>
<li><strong>numIterations</strong> – Number of iterations of boosting.
(default: 100)</li>
<li><strong>learningRate</strong> – Learning rate for shrinking the contribution of each estimator.
The learning rate should be in the interval (0, 1].
(default: 0.1)</li>
<li><strong>maxDepth</strong> – Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
means 1 internal node + 2 leaf nodes).
(default: 3)</li>
<li><strong>maxBins</strong> – Maximum number of bins used for splitting features. DecisionTree
requires maxBins &gt;= max categories.
(default: 32)</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">GradientBoostedTreesModel that can be used for prediction.</p>
</td>
</tr>
</tbody>
</table>
<p>Example usage:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.tree</span> <span class="k">import</span> <span class="n">GradientBoostedTrees</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">SparseVector</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sparse_data</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">}))</span>
<span class="gp">... </span><span class="p">]</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">sparse_data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">GradientBoostedTrees</span><span class="o">.</span><span class="n">trainRegressor</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">{},</span> <span class="n">numIterations</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numTrees</span><span class="p">()</span>
<span class="go">10</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">totalNumNodes</span><span class="p">()</span>
<span class="go">12</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">}))</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rdd</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">rdd</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[1.0, 0.0]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.mllib.util">
<span id="pyspark-mllib-util-module"></span><h2>pyspark.mllib.util module<a class="headerlink" href="#module-pyspark.mllib.util" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.mllib.util.JavaLoader">
<em class="property">class </em><code class="descclassname">pyspark.mllib.util.</code><code class="descname">JavaLoader</code><a class="reference internal" href="_modules/pyspark/mllib/util.html#JavaLoader"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.JavaLoader" title="Permalink to this definition"></a></dt>
<dd><p>Mixin for classes which can load saved models using their Scala
implementation.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.util.JavaLoader.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#JavaLoader.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.JavaLoader.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.util.JavaSaveable">
<em class="property">class </em><code class="descclassname">pyspark.mllib.util.</code><code class="descname">JavaSaveable</code><a class="reference internal" href="_modules/pyspark/mllib/util.html#JavaSaveable"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.JavaSaveable" title="Permalink to this definition"></a></dt>
<dd><p>Mixin for models that provide save() through their Scala
implementation.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.util.JavaSaveable.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#JavaSaveable.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.JavaSaveable.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.util.LinearDataGenerator">
<em class="property">class </em><code class="descclassname">pyspark.mllib.util.</code><code class="descname">LinearDataGenerator</code><a class="reference internal" href="_modules/pyspark/mllib/util.html#LinearDataGenerator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.LinearDataGenerator" title="Permalink to this definition"></a></dt>
<dd><p>Utils for generating linear data.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.LinearDataGenerator.generateLinearInput">
<em class="property">static </em><code class="descname">generateLinearInput</code><span class="sig-paren">(</span><em>intercept</em>, <em>weights</em>, <em>xMean</em>, <em>xVariance</em>, <em>nPoints</em>, <em>seed</em>, <em>eps</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#LinearDataGenerator.generateLinearInput"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.LinearDataGenerator.generateLinearInput" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Param:</th><td class="field-body"><strong>intercept</strong> – bias factor, the term c in X’w + c</td>
</tr>
<tr class="field-even field"><th class="field-name">Param:</th><td class="field-body"><strong>weights</strong> – feature vector, the term w in X’w + c</td>
</tr>
<tr class="field-odd field"><th class="field-name">Param:</th><td class="field-body"><strong>xMean</strong> – Point around which the data X is centered.</td>
</tr>
<tr class="field-even field"><th class="field-name">Param:</th><td class="field-body"><strong>xVariance</strong> – Variance of the given data</td>
</tr>
<tr class="field-odd field"><th class="field-name">Param:</th><td class="field-body"><strong>nPoints</strong> – Number of points to be generated</td>
</tr>
<tr class="field-even field"><th class="field-name">Param:</th><td class="field-body"><strong>seed</strong> – Random Seed</td>
</tr>
<tr class="field-odd field"><th class="field-name">Param:</th><td class="field-body"><strong>eps</strong> – Used to scale the noise. If eps is set high,
the amount of Gaussian noise added is more.</td>
</tr>
</tbody>
</table>
<p>Returns a list of LabeledPoints of length nPoints.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.LinearDataGenerator.generateLinearRDD">
<em class="property">static </em><code class="descname">generateLinearRDD</code><span class="sig-paren">(</span><em>sc</em>, <em>nexamples</em>, <em>nfeatures</em>, <em>eps</em>, <em>nParts=2</em>, <em>intercept=0.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#LinearDataGenerator.generateLinearRDD"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.LinearDataGenerator.generateLinearRDD" title="Permalink to this definition"></a></dt>
<dd><p>Generate an RDD of LabeledPoints.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.util.Loader">
<em class="property">class </em><code class="descclassname">pyspark.mllib.util.</code><code class="descname">Loader</code><a class="reference internal" href="_modules/pyspark/mllib/util.html#Loader"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.Loader" title="Permalink to this definition"></a></dt>
<dd><p>Mixin for classes which can load saved models from files.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.mllib.util.Loader.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#Loader.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.Loader.load" title="Permalink to this definition"></a></dt>
<dd><p>Load a model from the given path. The model should have been
saved using <code class="xref py py-meth docutils literal"><span class="pre">Saveable.save()</span></code>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – Spark context used for loading model files.</li>
<li><strong>path</strong> – Path specifying the directory to which the model
was saved.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">model instance</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.util.MLUtils">
<em class="property">class </em><code class="descclassname">pyspark.mllib.util.</code><code class="descname">MLUtils</code><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils" title="Permalink to this definition"></a></dt>
<dd><p>Helper methods to load, save and pre-process data used in MLlib.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.appendBias">
<em class="property">static </em><code class="descname">appendBias</code><span class="sig-paren">(</span><em>data</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.appendBias"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.appendBias" title="Permalink to this definition"></a></dt>
<dd><p>Returns a new vector with <cite>1.0</cite> (bias) appended to
the end of the input vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.convertMatrixColumnsFromML">
<em class="property">static </em><code class="descname">convertMatrixColumnsFromML</code><span class="sig-paren">(</span><em>dataset</em>, <em>*cols</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.convertMatrixColumnsFromML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.convertMatrixColumnsFromML" title="Permalink to this definition"></a></dt>
<dd><p>Converts matrix columns in an input DataFrame to the
<a class="reference internal" href="#pyspark.mllib.linalg.Matrix" title="pyspark.mllib.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Matrix</span></code></a> type from the new
<a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.Matrix" title="pyspark.ml.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.Matrix</span></code></a> type under the <cite>spark.ml</cite>
package.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset</li>
<li><strong>cols</strong> – a list of matrix columns to be converted.
Old matrix columns will be ignored. If unspecified, all new
matrix columns will be converted except nested ones.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the input dataset with new matrix columns converted to the
old matrix type</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">pyspark</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Matrices</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="k">import</span> <span class="n">MLUtils</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="nb">range</span><span class="p">(</span><span class="mi">4</span><span class="p">)))],</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="s2">&quot;y&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r1</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertMatrixColumnsFromML</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseMatrix</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseMatrix</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r2</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertMatrixColumnsFromML</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseMatrix</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseMatrix</span><span class="p">)</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.convertMatrixColumnsToML">
<em class="property">static </em><code class="descname">convertMatrixColumnsToML</code><span class="sig-paren">(</span><em>dataset</em>, <em>*cols</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.convertMatrixColumnsToML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.convertMatrixColumnsToML" title="Permalink to this definition"></a></dt>
<dd><p>Converts matrix columns in an input DataFrame from the
<a class="reference internal" href="#pyspark.mllib.linalg.Matrix" title="pyspark.mllib.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Matrix</span></code></a> type to the new
<a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.Matrix" title="pyspark.ml.linalg.Matrix"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.Matrix</span></code></a> type under the <cite>spark.ml</cite>
package.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset</li>
<li><strong>cols</strong> – a list of matrix columns to be converted.
New matrix columns will be ignored. If unspecified, all old
matrix columns will be converted except nested ones.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the input dataset with old matrix columns converted to the
new matrix type</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">pyspark</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Matrices</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="k">import</span> <span class="n">MLUtils</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="n">Matrices</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">Matrices</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="nb">range</span><span class="p">(</span><span class="mi">4</span><span class="p">)))],</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="s2">&quot;y&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r1</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertMatrixColumnsToML</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseMatrix</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseMatrix</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r2</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertMatrixColumnsToML</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseMatrix</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseMatrix</span><span class="p">)</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.convertVectorColumnsFromML">
<em class="property">static </em><code class="descname">convertVectorColumnsFromML</code><span class="sig-paren">(</span><em>dataset</em>, <em>*cols</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.convertVectorColumnsFromML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.convertVectorColumnsFromML" title="Permalink to this definition"></a></dt>
<dd><p>Converts vector columns in an input DataFrame to the
<a class="reference internal" href="#pyspark.mllib.linalg.Vector" title="pyspark.mllib.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Vector</span></code></a> type from the new
<a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.Vector" title="pyspark.ml.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.Vector</span></code></a> type under the <cite>spark.ml</cite>
package.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset</li>
<li><strong>cols</strong> – a list of vector columns to be converted.
Old vector columns will be ignored. If unspecified, all new
vector columns will be converted except nested ones.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the input dataset with new vector columns converted to the
old vector type</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">pyspark</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="k">import</span> <span class="n">MLUtils</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">))],</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="s2">&quot;y&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r1</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertVectorColumnsFromML</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseVector</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseVector</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r2</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertVectorColumnsFromML</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseVector</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseVector</span><span class="p">)</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.convertVectorColumnsToML">
<em class="property">static </em><code class="descname">convertVectorColumnsToML</code><span class="sig-paren">(</span><em>dataset</em>, <em>*cols</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.convertVectorColumnsToML"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.convertVectorColumnsToML" title="Permalink to this definition"></a></dt>
<dd><p>Converts vector columns in an input DataFrame from the
<a class="reference internal" href="#pyspark.mllib.linalg.Vector" title="pyspark.mllib.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.mllib.linalg.Vector</span></code></a> type to the new
<a class="reference internal" href="pyspark.ml.html#pyspark.ml.linalg.Vector" title="pyspark.ml.linalg.Vector"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.linalg.Vector</span></code></a> type under the <cite>spark.ml</cite>
package.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset</li>
<li><strong>cols</strong> – a list of vector columns to be converted.
New vector columns will be ignored. If unspecified, all old
vector columns will be converted except nested ones.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the input dataset with old vector columns converted to the
new vector type</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">pyspark</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="k">import</span> <span class="n">MLUtils</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">))],</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="s2">&quot;y&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r1</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertVectorColumnsToML</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseVector</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r1</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseVector</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">r2</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">convertVectorColumnsToML</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">SparseVector</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">isinstance</span><span class="p">(</span><span class="n">r2</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">DenseVector</span><span class="p">)</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.loadLabeledPoints">
<em class="property">static </em><code class="descname">loadLabeledPoints</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em>, <em>minPartitions=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.loadLabeledPoints"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.loadLabeledPoints" title="Permalink to this definition"></a></dt>
<dd><p>Load labeled points saved using RDD.saveAsTextFile.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – Spark context</li>
<li><strong>path</strong> – file or directory path in any Hadoop-supported file
system URI</li>
<li><strong>minPartitions</strong> – min number of partitions</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">labeled data stored as an RDD of LabeledPoint</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">tempfile</span> <span class="k">import</span> <span class="n">NamedTemporaryFile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="k">import</span> <span class="n">MLUtils</span>
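<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>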
<span class="gp">&gt;&gt;&gt; </span><span class="n">examples</span> <span class="o">=</span> <span class="p">[</span><span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.23</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">4.56e-7</span><span class="p">)])),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.01</span><span class="p">,</span> <span class="mf">2.02</span><span class="p">,</span> <span class="mf">3.03</span><span class="p">]))]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tempFile</span> <span class="o">=</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="n">delete</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tempFile</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">examples</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">saveAsTextFile</span><span class="p">(</span><span class="n">tempFile</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLabeledPoints</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">tempFile</span><span class="o">.</span><span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.loadLibSVMFile">
<em class="property">static </em><code class="descname">loadLibSVMFile</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em>, <em>numFeatures=-1</em>, <em>minPartitions=None</em>, <em>multiclass=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.loadLibSVMFile"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.loadLibSVMFile" title="Permalink to this definition"></a></dt>
<dd><p>Loads labeled data in the LIBSVM format into an RDD of
LabeledPoint. The LIBSVM format is a text-based format used by
LIBSVM and LIBLINEAR. Each line represents a labeled sparse
feature vector using the following format:</p>
<p>label index1:value1 index2:value2 …</p>
<p>where the indices are one-based and in ascending order. This
method parses each line into a LabeledPoint, where the feature
indices are converted to zero-based.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sc</strong> – Spark context</li>
<li><strong>path</strong> – file or directory path in any Hadoop-supported file
system URI</li>
<li><strong>numFeatures</strong> – number of features, which will be determined
from the input data if a nonpositive value
is given. This is useful when the dataset is
already split into multiple files and you
want to load them separately, because some
features may not be present in certain files,
which leads to inconsistent feature
dimensions.</li>
<li><strong>minPartitions</strong> – min number of partitions</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">labeled data stored as an RDD of LabeledPoint</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">tempfile</span> <span class="k">import</span> <span class="n">NamedTemporaryFile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="k">import</span> <span class="n">MLUtils</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tempFile</span> <span class="o">=</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="n">delete</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">_</span> <span class="o">=</span> <span class="n">tempFile</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="sa">b</span><span class="s2">&quot;+1 1:1.0 3:2.0 5:3.0</span><span class="se">\n</span><span class="s2">-1</span><span class="se">\n</span><span class="s2">-1 2:4.0 4:5.0 6:6.0&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tempFile</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">examples</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">tempFile</span><span class="o">.</span><span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tempFile</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">examples</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">examples</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="go">LabeledPoint(-1.0, (6,[],[]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">examples</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
<span class="go">LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0]))</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.loadVectors">
<em class="property">static </em><code class="descname">loadVectors</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.loadVectors"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.loadVectors" title="Permalink to this definition"></a></dt>
<dd><p>Loads vectors saved using <cite>RDD[Vector].saveAsTextFile</cite>
with the default number of partitions.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.mllib.util.MLUtils.saveAsLibSVMFile">
<em class="property">static </em><code class="descname">saveAsLibSVMFile</code><span class="sig-paren">(</span><em>data</em>, <em>dir</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#MLUtils.saveAsLibSVMFile"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.MLUtils.saveAsLibSVMFile" title="Permalink to this definition"></a></dt>
<dd><p>Save labeled data in LIBSVM format.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>data</strong> – an RDD of LabeledPoint to be saved</li>
<li><strong>dir</strong> – directory to save the data</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">tempfile</span> <span class="k">import</span> <span class="n">NamedTemporaryFile</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">fileinput</span> <span class="k">import</span> <span class="nb">input</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="k">import</span> <span class="n">LabeledPoint</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">glob</span> <span class="k">import</span> <span class="n">glob</span>
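<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="k">import</span> <span class="n">MLUtils</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>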
<span class="gp">&gt;&gt;&gt; </span><span class="n">examples</span> <span class="o">=</span> <span class="p">[</span><span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">1.1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="mf">1.23</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">4.56</span><span class="p">)])),</span>
<span class="gp">... </span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.01</span><span class="p">,</span> <span class="mf">2.02</span><span class="p">,</span> <span class="mf">3.03</span><span class="p">]))]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tempFile</span> <span class="o">=</span> <span class="n">NamedTemporaryFile</span><span class="p">(</span><span class="n">delete</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tempFile</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">MLUtils</span><span class="o">.</span><span class="n">saveAsLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">examples</span><span class="p">),</span> <span class="n">tempFile</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="s1">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="nb">sorted</span><span class="p">(</span><span class="nb">input</span><span class="p">(</span><span class="n">glob</span><span class="p">(</span><span class="n">tempFile</span><span class="o">.</span><span class="n">name</span> <span class="o">+</span> <span class="s2">&quot;/part-0000*&quot;</span><span class="p">))))</span>
<span class="go">&#39;0.0 1:1.01 2:2.02 3:3.03\n1.1 1:1.23 3:4.56\n&#39;</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.mllib.util.Saveable">
<em class="property">class </em><code class="descclassname">pyspark.mllib.util.</code><code class="descname">Saveable</code><a class="reference internal" href="_modules/pyspark/mllib/util.html#Saveable"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.Saveable" title="Permalink to this definition"></a></dt>
<dd><p>Mixin for models and transformers which may be saved as files.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.mllib.util.Saveable.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>sc</em>, <em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/mllib/util.html#Saveable.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.mllib.util.Saveable.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this model to the given path.</p>
<dl class="docutils">
<dt>This saves:</dt>
<dd><ul class="first last simple">
<li>human-readable (JSON) model metadata to path/metadata/</li>
<li>Parquet formatted data to path/data/</li>
</ul>
</dd>
</dl>
<p>The model may be loaded using <code class="xref py py-meth docutils literal"><span class="pre">Loader.load()</span></code>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>sc</strong> – Spark context used to save model data.</li>
<li><strong>path</strong> – Path specifying the directory in which to save
this model. If the directory already exists,
this method throws an exception.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
</div>
</div>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<p class="logo"><a href="index.html">
<img class="logo" src="_static/spark-logo-hd.png" alt="Logo"/>
</a></p>
<h3><a href="index.html">Table Of Contents</a></h3>
<ul>
<li><a class="reference internal" href="#">pyspark.mllib package</a><ul>
<li><a class="reference internal" href="#module-pyspark.mllib.classification">pyspark.mllib.classification module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.clustering">pyspark.mllib.clustering module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.evaluation">pyspark.mllib.evaluation module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.feature">pyspark.mllib.feature module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.fpm">pyspark.mllib.fpm module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.linalg">pyspark.mllib.linalg module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.linalg.distributed">pyspark.mllib.linalg.distributed module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.random">pyspark.mllib.random module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.recommendation">pyspark.mllib.recommendation module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.regression">pyspark.mllib.regression module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.stat">pyspark.mllib.stat module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.tree">pyspark.mllib.tree module</a></li>
<li><a class="reference internal" href="#module-pyspark.mllib.util">pyspark.mllib.util module</a></li>
</ul>
</li>
</ul>
<h4>Previous topic</h4>
<p class="topless"><a href="pyspark.ml.html"
title="previous chapter">pyspark.ml package</a></p>
<div role="note" aria-label="source link">
<h3>This Page</h3>
<ul class="this-page-menu">
<li><a href="_sources/pyspark.mllib.rst.txt"
rel="nofollow">Show Source</a></li>
</ul>
</div>
<div id="searchbox" style="display: none" role="search">
<h3>Quick search</h3>
<form class="search" action="search.html" method="get">
<div><input type="text" name="q" /></div>
<div><input type="submit" value="Go" /></div>
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="pyspark.ml.html" title="pyspark.ml package"
>previous</a></li>
<li class="nav-item nav-item-0"><a href="index.html">PySpark 2.2.1 documentation</a> &#187;</li>
<li class="nav-item nav-item-1"><a href="pyspark.html" >pyspark package</a> &#187;</li>
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright .
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.6.5.
</div>
</body>
</html>