<!-- blob: a5757cdec007092edefabbb77489f58c29d29abd [file] [log] [blame] -->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>pyspark.ml package &#8212; PySpark 2.2.1 documentation</title>
<link rel="stylesheet" href="_static/nature.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/pyspark.css" type="text/css" />
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
URL_ROOT: './',
VERSION: '2.2.1',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true,
SOURCELINK_SUFFIX: '.txt'
};
</script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="_static/pyspark.js"></script>
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="pyspark.mllib package" href="pyspark.mllib.html" />
<link rel="prev" title="pyspark.streaming module" href="pyspark.streaming.html" />
</head>
<body>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="pyspark.mllib.html" title="pyspark.mllib package"
accesskey="N">next</a></li>
<li class="right" >
<a href="pyspark.streaming.html" title="pyspark.streaming module"
accesskey="P">previous</a> |</li>
<li class="nav-item nav-item-0"><a href="index.html">PySpark 2.2.1 documentation</a> &#187;</li>
<li class="nav-item nav-item-1"><a href="pyspark.html" accesskey="U">pyspark package</a> &#187;</li>
</ul>
</div>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<div class="section" id="pyspark-ml-package">
<h1>pyspark.ml package<a class="headerlink" href="#pyspark-ml-package" title="Permalink to this headline"></a></h1>
<div class="section" id="module-pyspark.ml">
<span id="ml-pipeline-apis"></span><h2>ML Pipeline APIs<a class="headerlink" href="#module-pyspark.ml" title="Permalink to this headline"></a></h2>
<p>DataFrame-based machine learning APIs to let users quickly assemble and configure practical
machine learning pipelines.</p>
<dl class="class">
<dt id="pyspark.ml.Transformer">
<em class="property">class </em><code class="descclassname">pyspark.ml.</code><code class="descname">Transformer</code><a class="reference internal" href="_modules/pyspark/ml/base.html#Transformer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Transformer" title="Permalink to this definition"></a></dt>
<dd><p>Abstract class for transformers that transform one dataset into another.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.Transformer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. The default implementation creates a
shallow copy using <code class="xref py py-func docutils literal"><span class="pre">copy.copy()</span></code>, and then copies the
embedded and extra parameters over and returns the copy.
Subclasses should override this method if the default approach
is not sufficient.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map; when values conflict, the one later in this ordering
takes precedence: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Transformer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.Transformer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.Transformer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Transformer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/base.html#Transformer.transform"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Transformer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.Estimator">
<em class="property">class </em><code class="descclassname">pyspark.ml.</code><code class="descname">Estimator</code><a class="reference internal" href="_modules/pyspark/ml/base.html#Estimator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Estimator" title="Permalink to this definition"></a></dt>
<dd><p>Abstract class for estimators that fit models to data.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.Estimator.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. The default implementation creates a
shallow copy using <code class="xref py py-func docutils literal"><span class="pre">copy.copy()</span></code>, and then copies the
embedded and extra parameters over and returns the copy.
Subclasses should override this method if the default approach
is not sufficient.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map; when values conflict, the one later in this ordering
takes precedence: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/base.html#Estimator.fit"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Estimator.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Estimator.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Estimator.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.Estimator.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.Estimator.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.Model">
<em class="property">class </em><code class="descclassname">pyspark.ml.</code><code class="descname">Model</code><a class="reference internal" href="_modules/pyspark/ml/base.html#Model"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Model" title="Permalink to this definition"></a></dt>
<dd><p>Abstract class for models that are fitted by estimators.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.Model.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. The default implementation creates a
shallow copy using <code class="xref py py-func docutils literal"><span class="pre">copy.copy()</span></code>, and then copies the
embedded and extra parameters over and returns the copy.
Subclasses should override this method if the default approach
is not sufficient.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map; when values conflict, the one later in this ordering
takes precedence: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.Model.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.Model.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Model.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Model.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.Pipeline">
<em class="property">class </em><code class="descclassname">pyspark.ml.</code><code class="descname">Pipeline</code><span class="sig-paren">(</span><em>stages=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline" title="Permalink to this definition"></a></dt>
<dd><p>A simple pipeline, which acts as an estimator. A Pipeline consists
of a sequence of stages, each of which is either an
<a class="reference internal" href="#pyspark.ml.Estimator" title="pyspark.ml.Estimator"><code class="xref py py-class docutils literal"><span class="pre">Estimator</span></code></a> or a <a class="reference internal" href="#pyspark.ml.Transformer" title="pyspark.ml.Transformer"><code class="xref py py-class docutils literal"><span class="pre">Transformer</span></code></a>. When
<a class="reference internal" href="#pyspark.ml.Pipeline.fit" title="pyspark.ml.Pipeline.fit"><code class="xref py py-meth docutils literal"><span class="pre">Pipeline.fit()</span></code></a> is called, the stages are executed in
order. If a stage is an <a class="reference internal" href="#pyspark.ml.Estimator" title="pyspark.ml.Estimator"><code class="xref py py-class docutils literal"><span class="pre">Estimator</span></code></a>, its
<a class="reference internal" href="#pyspark.ml.Estimator.fit" title="pyspark.ml.Estimator.fit"><code class="xref py py-meth docutils literal"><span class="pre">Estimator.fit()</span></code></a> method will be called on the input
dataset to fit a model. Then the model, which is a transformer,
will be used to transform the dataset as the input to the next
stage. If a stage is a <a class="reference internal" href="#pyspark.ml.Transformer" title="pyspark.ml.Transformer"><code class="xref py py-class docutils literal"><span class="pre">Transformer</span></code></a>, its
<a class="reference internal" href="#pyspark.ml.Transformer.transform" title="pyspark.ml.Transformer.transform"><code class="xref py py-meth docutils literal"><span class="pre">Transformer.transform()</span></code></a> method will be called to produce
the dataset for the next stage. The fitted model from a
<a class="reference internal" href="#pyspark.ml.Pipeline" title="pyspark.ml.Pipeline"><code class="xref py py-class docutils literal"><span class="pre">Pipeline</span></code></a> is a <a class="reference internal" href="#pyspark.ml.PipelineModel" title="pyspark.ml.PipelineModel"><code class="xref py py-class docutils literal"><span class="pre">PipelineModel</span></code></a>, which
consists of fitted models and transformers, corresponding to the
pipeline stages. If stages is an empty list, the pipeline acts as an
identity transformer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.Pipeline.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra parameters</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">new instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map; when values conflict, the one later in this ordering
takes precedence: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.getStages">
<code class="descname">getStages</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline.getStages"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline.getStages" title="Permalink to this definition"></a></dt>
<dd><p>Get pipeline stages.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.Pipeline.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.Pipeline.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.Pipeline.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.ml.Pipeline.read">
<em class="property">classmethod </em><code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline.read"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>stages=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for Pipeline.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.setStages">
<code class="descname">setStages</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline.setStages"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline.setStages" title="Permalink to this definition"></a></dt>
<dd><p>Set pipeline stages.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>value</strong> – a list of transformers or estimators</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the pipeline instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.Pipeline.stages">
<code class="descname">stages</code><em class="property"> = Param(parent='undefined', name='stages', doc='a list of pipeline stages')</em><a class="headerlink" href="#pyspark.ml.Pipeline.stages" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.Pipeline.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#Pipeline.write"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.Pipeline.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.PipelineModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.</code><code class="descname">PipelineModel</code><span class="sig-paren">(</span><em>stages</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#PipelineModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.PipelineModel" title="Permalink to this definition"></a></dt>
<dd><p>Represents a compiled pipeline with transformers and fitted models.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#PipelineModel.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.PipelineModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra parameters</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">new instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.PipelineModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.PipelineModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.ml.PipelineModel.read">
<em class="property">classmethod </em><code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#PipelineModel.read"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.PipelineModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#PipelineModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.PipelineModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.PipelineModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.PipelineModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/pipeline.html#PipelineModel.write"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.PipelineModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.param">
<span id="pyspark-ml-param-module"></span><h2>pyspark.ml.param module<a class="headerlink" href="#module-pyspark.ml.param" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.param.Param">
<em class="property">class </em><code class="descclassname">pyspark.ml.param.</code><code class="descname">Param</code><span class="sig-paren">(</span><em>parent</em>, <em>name</em>, <em>doc</em>, <em>typeConverter=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Param"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Param" title="Permalink to this definition"></a></dt>
<dd><p>A param with self-contained documentation.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.param.Params">
<em class="property">class </em><code class="descclassname">pyspark.ml.param.</code><code class="descname">Params</code><a class="reference internal" href="_modules/pyspark/ml/param.html#Params"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params" title="Permalink to this definition"></a></dt>
<dd><p>Components that take parameters. This also provides an internal
param map to store parameter values attached to the instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.param.Params.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. The default implementation creates a
shallow copy using <code class="xref py py-func docutils literal"><span class="pre">copy.copy()</span></code>, and then copies the
embedded and extra parameters over and returns the copy.
Subclasses should override this method if the default approach
is not sufficient.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.explainParam"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.explainParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.extractParamMap"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.getOrDefault"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.getParam"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.hasDefault"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.hasParam"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.isDefined"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.param.Params.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#Params.isSet"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.Params.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.param.Params.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.param.Params.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<a class="reference internal" href="#pyspark.ml.param.Param" title="pyspark.ml.param.Param"><code class="xref py py-class docutils literal"><span class="pre">Param</span></code></a>.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.param.TypeConverters">
<em class="property">class </em><code class="descclassname">pyspark.ml.param.</code><code class="descname">TypeConverters</code><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">DeveloperApi</p>
</div>
<p>Factory methods for common type conversion functions for <cite>Param.typeConverter</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.identity">
<em class="property">static </em><code class="descname">identity</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.identity"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.identity" title="Permalink to this definition"></a></dt>
<dd><p>Dummy converter that just returns value.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toBoolean">
<em class="property">static </em><code class="descname">toBoolean</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toBoolean"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toBoolean" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to a boolean, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toFloat">
<em class="property">static </em><code class="descname">toFloat</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toFloat"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toFloat" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to a float, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toInt">
<em class="property">static </em><code class="descname">toInt</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toInt"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toInt" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to an int, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toList">
<em class="property">static </em><code class="descname">toList</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toList"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toList" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to a list, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toListFloat">
<em class="property">static </em><code class="descname">toListFloat</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toListFloat"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toListFloat" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to a list of floats, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toListInt">
<em class="property">static </em><code class="descname">toListInt</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toListInt"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toListInt" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to a list of ints, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toListString">
<em class="property">static </em><code class="descname">toListString</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toListString"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toListString" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to a list of strings, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toString">
<em class="property">static </em><code class="descname">toString</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toString"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toString" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to a string, if possible.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.param.TypeConverters.toVector">
<em class="property">static </em><code class="descname">toVector</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/param.html#TypeConverters.toVector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.param.TypeConverters.toVector" title="Permalink to this definition"></a></dt>
<dd><p>Convert a value to an MLlib Vector, if possible.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.feature">
<span id="pyspark-ml-feature-module"></span><h2>pyspark.ml.feature module<a class="headerlink" href="#module-pyspark.ml.feature" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.feature.Binarizer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">Binarizer</code><span class="sig-paren">(</span><em>threshold=0.0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Binarizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Binarizer" title="Permalink to this definition"></a></dt>
<dd><p>Binarize a column of continuous features given a threshold.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="mf">0.5</span><span class="p">,)],</span> <span class="p">[</span><span class="s2">&quot;values&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">binarizer</span> <span class="o">=</span> <span class="n">Binarizer</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;values&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">binarizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">features</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">binarizer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;freqs&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">freqs</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="n">binarizer</span><span class="o">.</span><span class="n">threshold</span><span class="p">:</span> <span class="o">-</span><span class="mf">0.5</span><span class="p">,</span> <span class="n">binarizer</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;vector&quot;</span><span class="p">}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">binarizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">binarizerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/binarizer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">binarizer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">binarizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedBinarizer</span> <span class="o">=</span> <span class="n">Binarizer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">binarizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedBinarizer</span><span class="o">.</span><span class="n">getThreshold</span><span class="p">()</span> <span class="o">==</span> <span class="n">binarizer</span><span class="o">.</span><span class="n">getThreshold</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.getThreshold">
<code class="descname">getThreshold</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Binarizer.getThreshold"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Binarizer.getThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of threshold or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Binarizer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Binarizer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Binarizer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Binarizer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Binarizer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.Binarizer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Binarizer.inputCol" title="pyspark.ml.feature.Binarizer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Binarizer.outputCol" title="pyspark.ml.feature.Binarizer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>threshold=0.0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Binarizer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Binarizer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this Binarizer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.setThreshold">
<code class="descname">setThreshold</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Binarizer.setThreshold"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Binarizer.setThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Binarizer.threshold" title="pyspark.ml.feature.Binarizer.threshold"><code class="xref py py-attr docutils literal"><span class="pre">threshold</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Binarizer.threshold">
<code class="descname">threshold</code><em class="property"> = Param(parent='undefined', name='threshold', doc='threshold in binary classification prediction, in range [0, 1]')</em><a class="headerlink" href="#pyspark.ml.feature.Binarizer.threshold" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Binarizer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Binarizer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">BucketedRandomProjectionLSH</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em>, <em>seed=None</em>, <em>numHashTables=1</em>, <em>bucketLength=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#BucketedRandomProjectionLSH"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>LSH class for Euclidean distance metrics.
The input is dense or sparse vectors, each of which represents a point in the Euclidean
distance space. The output will be vectors of configurable dimension. Hash values in the same
dimension are calculated by the same hash function.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions">Stable Distributions</a></p>
</div>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://arxiv.org/abs/1408.2927">Hashing for Similarity Search: A Survey</a></p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="k">import</span> <span class="n">col</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span> <span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span> <span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span> <span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">brp</span> <span class="o">=</span> <span class="n">BucketedRandomProjectionLSH</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;hashes&quot;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">seed</span><span class="o">=</span><span class="mi">12345</span><span class="p">,</span> <span class="n">bucketLength</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">brp</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(id=0, features=DenseVector([-1.0, -1.0]), hashes=[DenseVector([-1.0])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data2</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">4</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.0</span> <span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span> <span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">2.0</span> <span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">7</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data2</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">approxNearestNeighbors</span><span class="p">(</span><span class="n">df2</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">]),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(id=4, features=DenseVector([2.0, 2.0]), hashes=[DenseVector([1.0])], distCol=1.0)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">approxSimilarityJoin</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">df2</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="n">distCol</span><span class="o">=</span><span class="s2">&quot;EuclideanDistance&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="gp">... </span> <span class="n">col</span><span class="p">(</span><span class="s2">&quot;datasetA.id&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;idA&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">col</span><span class="p">(</span><span class="s2">&quot;datasetB.id&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;idB&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">col</span><span class="p">(</span><span class="s2">&quot;EuclideanDistance&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+-----------------+</span>
<span class="go">|idA|idB|EuclideanDistance|</span>
<span class="go">+---+---+-----------------+</span>
<span class="go">| 3| 6| 2.23606797749979|</span>
<span class="go">+---+---+-----------------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">brpPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/brp&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">brp</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">brpPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">brp2</span> <span class="o">=</span> <span class="n">BucketedRandomProjectionLSH</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">brpPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">brp2</span><span class="o">.</span><span class="n">getBucketLength</span><span class="p">()</span> <span class="o">==</span> <span class="n">brp</span><span class="o">.</span><span class="n">getBucketLength</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/brp-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">BucketedRandomProjectionLSHModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">hashes</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">hashes</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.bucketLength">
<code class="descname">bucketLength</code><em class="property"> = Param(parent='undefined', name='bucketLength', doc='the length of each hash bucket, a larger bucket lowers the false negative rate.')</em><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.bucketLength" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.getBucketLength">
<code class="descname">getBucketLength</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#BucketedRandomProjectionLSH.getBucketLength"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.getBucketLength" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of bucketLength or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.getNumHashTables">
<code class="descname">getNumHashTables</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.getNumHashTables" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numHashTables or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.numHashTables">
<code class="descname">numHashTables</code><em class="property"> = Param(parent='undefined', name='numHashTables', doc='number of hash tables, where increasing number of hash tables lowers the false negative rate, and decreasing it improves the running performance.')</em><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.numHashTables" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.setBucketLength">
<code class="descname">setBucketLength</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#BucketedRandomProjectionLSH.setBucketLength"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.setBucketLength" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.bucketLength" title="pyspark.ml.feature.BucketedRandomProjectionLSH.bucketLength"><code class="xref py py-attr docutils literal"><span class="pre">bucketLength</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.inputCol" title="pyspark.ml.feature.BucketedRandomProjectionLSH.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.setNumHashTables">
<code class="descname">setNumHashTables</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.setNumHashTables" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.numHashTables" title="pyspark.ml.feature.BucketedRandomProjectionLSH.numHashTables"><code class="xref py py-attr docutils literal"><span class="pre">numHashTables</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.outputCol" title="pyspark.ml.feature.BucketedRandomProjectionLSH.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>seed=None</em>, <em>numHashTables=1</em>, <em>bucketLength=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#BucketedRandomProjectionLSH.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this BucketedRandomProjectionLSH.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.seed" title="pyspark.ml.feature.BucketedRandomProjectionLSH.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSH.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSH.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">BucketedRandomProjectionLSHModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#BucketedRandomProjectionLSHModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.BucketedRandomProjectionLSH" title="pyspark.ml.feature.BucketedRandomProjectionLSH"><code class="xref py py-class docutils literal"><span class="pre">BucketedRandomProjectionLSH</span></code></a>, where multiple random vectors are
stored. The vectors are normalized to be unit vectors and each vector is used in a hash
function: <span class="math">\(h_i(x) = floor(r_i \cdot x / bucketLength)\)</span> where <span class="math">\(r_i\)</span> is the
i-th random unit vector. The number of buckets will be <cite>(max L2 norm of input vectors) /
bucketLength</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.approxNearestNeighbors">
<code class="descname">approxNearestNeighbors</code><span class="sig-paren">(</span><em>dataset</em>, <em>key</em>, <em>numNearestNeighbors</em>, <em>distCol='distCol'</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.approxNearestNeighbors" title="Permalink to this definition"></a></dt>
<dd><p>Given a large dataset and an item, approximately find at most k items which have the
closest distance to the item. If the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> is missing, the method will
transform the data; if the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> exists, it will use that. This allows
caching of the transformed data when necessary.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This method is experimental and will likely change behavior in the next release.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – The dataset to search for nearest neighbors of the key.</li>
<li><strong>key</strong> – Feature vector representing the item to search for.</li>
<li><strong>numNearestNeighbors</strong> – The maximum number of nearest neighbors (k) to return.</li>
<li><strong>distCol</strong> – Output column for storing the distance between each result row and the key.
Use “distCol” as default value if it’s not specified.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A dataset containing at most k items closest to the key. A column “distCol” is
added to show the distance between each row and the key.</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.approxSimilarityJoin">
<code class="descname">approxSimilarityJoin</code><span class="sig-paren">(</span><em>datasetA</em>, <em>datasetB</em>, <em>threshold</em>, <em>distCol='distCol'</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.approxSimilarityJoin" title="Permalink to this definition"></a></dt>
<dd><p>Join two datasets to approximately find all pairs of rows whose distance is smaller than
the threshold. If the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> is missing, the method will transform the data;
if the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> exists, it will use that. This allows caching of the
transformed data when necessary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>datasetA</strong> – One of the datasets to join.</li>
<li><strong>datasetB</strong> – Another dataset to join.</li>
<li><strong>threshold</strong> – The threshold for the distance of row pairs.</li>
<li><strong>distCol</strong> – Output column for storing the distance between each pair of rows. Use
“distCol” as default value if it’s not specified.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A joined dataset containing pairs of rows. The original rows are in columns
“datasetA” and “datasetB”, and a column “distCol” is added to show the distance
between each pair.</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.BucketedRandomProjectionLSHModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.BucketedRandomProjectionLSHModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.Bucketizer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">Bucketizer</code><span class="sig-paren">(</span><em>splits=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>handleInvalid='error'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Bucketizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Bucketizer" title="Permalink to this definition"></a></dt>
<dd><p>Maps a column of continuous features to a column of feature buckets.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">values</span> <span class="o">=</span> <span class="p">[(</span><span class="mf">0.1</span><span class="p">,),</span> <span class="p">(</span><span class="mf">0.4</span><span class="p">,),</span> <span class="p">(</span><span class="mf">1.2</span><span class="p">,),</span> <span class="p">(</span><span class="mf">1.5</span><span class="p">,),</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),),</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;values&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketizer</span> <span class="o">=</span> <span class="n">Bucketizer</span><span class="p">(</span><span class="n">splits</span><span class="o">=</span><span class="p">[</span><span class="o">-</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;inf&quot;</span><span class="p">),</span> <span class="mf">0.5</span><span class="p">,</span> <span class="mf">1.4</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="s2">&quot;inf&quot;</span><span class="p">)],</span>
<span class="gp">... </span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;values&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;buckets&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="s2">&quot;keep&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">bucketed</span><span class="p">)</span>
<span class="go">6</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">buckets</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">buckets</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">buckets</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">buckets</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketizer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;b&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">b</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketizerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/bucketizer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketizer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">bucketizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedBucketizer</span> <span class="o">=</span> <span class="n">Bucketizer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">bucketizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedBucketizer</span><span class="o">.</span><span class="n">getSplits</span><span class="p">()</span> <span class="o">==</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">getSplits</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="s2">&quot;skip&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">bucketed</span><span class="p">)</span>
<span class="go">4</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.getHandleInvalid">
<code class="descname">getHandleInvalid</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Bucketizer.getHandleInvalid"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.getHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.Bucketizer.handleInvalid" title="pyspark.ml.feature.Bucketizer.handleInvalid"><code class="xref py py-attr docutils literal"><span class="pre">handleInvalid</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.getSplits">
<code class="descname">getSplits</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Bucketizer.getSplits"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.getSplits" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of splits or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Bucketizer.handleInvalid">
<code class="descname">handleInvalid</code><em class="property"> = Param(parent='undefined', name='handleInvalid', doc=&quot;how to handle invalid entries. Options are 'skip' (filter out rows with invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special additional bucket).&quot;)</em><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.handleInvalid" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Bucketizer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Bucketizer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Bucketizer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.setHandleInvalid">
<code class="descname">setHandleInvalid</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Bucketizer.setHandleInvalid"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.setHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Bucketizer.handleInvalid" title="pyspark.ml.feature.Bucketizer.handleInvalid"><code class="xref py py-attr docutils literal"><span class="pre">handleInvalid</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Bucketizer.inputCol" title="pyspark.ml.feature.Bucketizer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Bucketizer.outputCol" title="pyspark.ml.feature.Bucketizer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>splits=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>handleInvalid=&quot;error&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Bucketizer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this Bucketizer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.setSplits">
<code class="descname">setSplits</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Bucketizer.setSplits"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.setSplits" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Bucketizer.splits" title="pyspark.ml.feature.Bucketizer.splits"><code class="xref py py-attr docutils literal"><span class="pre">splits</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Bucketizer.splits">
<code class="descname">splits</code><em class="property"> = Param(parent='undefined', name='splits', doc='Split points for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. The splits should be of length &gt;= 3 and strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, values outside the splits specified will be treated as errors.')</em><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.splits" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Bucketizer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Bucketizer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.ChiSqSelector">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">ChiSqSelector</code><span class="sig-paren">(</span><em>numTopFeatures=50</em>, <em>featuresCol='features'</em>, <em>outputCol=None</em>, <em>labelCol='label'</em>, <em>selectorType='numTopFeatures'</em>, <em>percentile=0.1</em>, <em>fpr=0.05</em>, <em>fdr=0.05</em>, <em>fwe=0.05</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Chi-Squared feature selection, which selects categorical features to use for predicting a
categorical label.
The selector supports different selection methods: <cite>numTopFeatures</cite>, <cite>percentile</cite>, <cite>fpr</cite>,
<cite>fdr</cite>, <cite>fwe</cite>.</p>
<blockquote>
<div><ul class="simple">
<li><cite>numTopFeatures</cite> chooses a fixed number of top features according to a chi-squared test.</li>
<li><cite>percentile</cite> is similar but chooses a fraction of all features
instead of a fixed number.</li>
<li><cite>fpr</cite> chooses all features whose p-values are below a threshold,
thus controlling the false positive rate of selection.</li>
<li><cite>fdr</cite> uses the <a class="reference external" href="https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure">Benjamini-Hochberg procedure</a>
to choose all features whose false discovery rate is below a threshold.</li>
<li><cite>fwe</cite> chooses all features whose p-values are below a threshold. The threshold is scaled by
1/numFeatures, thus controlling the family-wise error rate of selection.</li>
</ul>
</div></blockquote>
<p>By default, the selection method is <cite>numTopFeatures</cite>, with the default number of top features
set to 50.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">18.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">12.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">15.0</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">)],</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="s2">&quot;label&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">selector</span> <span class="o">=</span> <span class="n">ChiSqSelector</span><span class="p">(</span><span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;selectedFeatures&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">selector</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">selectedFeatures</span>
<span class="go">DenseVector([18.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">selectedFeatures</span>
<span class="go">[2]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">chiSqSelectorPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/chi-sq-selector&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">selector</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">chiSqSelectorPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedSelector</span> <span class="o">=</span> <span class="n">ChiSqSelector</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">chiSqSelectorPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedSelector</span><span class="o">.</span><span class="n">getNumTopFeatures</span><span class="p">()</span> <span class="o">==</span> <span class="n">selector</span><span class="o">.</span><span class="n">getNumTopFeatures</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/chi-sq-selector-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">ChiSqSelectorModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">selectedFeatures</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">selectedFeatures</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.fdr">
<code class="descname">fdr</code><em class="property"> = Param(parent='undefined', name='fdr', doc='The upper bound of the expected false discovery rate.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.fdr" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.fpr">
<code class="descname">fpr</code><em class="property"> = Param(parent='undefined', name='fpr', doc='The highest p-value for features to be kept.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.fpr" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.fwe">
<code class="descname">fwe</code><em class="property"> = Param(parent='undefined', name='fwe', doc='The upper bound of the expected family-wise error rate.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.fwe" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getFdr">
<code class="descname">getFdr</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.getFdr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getFdr" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fdr or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getFpr">
<code class="descname">getFpr</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.getFpr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getFpr" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fpr or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getFwe">
<code class="descname">getFwe</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.getFwe"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getFwe" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fwe or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getNumTopFeatures">
<code class="descname">getNumTopFeatures</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.getNumTopFeatures"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getNumTopFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numTopFeatures or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getPercentile">
<code class="descname">getPercentile</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.getPercentile"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getPercentile" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of percentile or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.getSelectorType">
<code class="descname">getSelectorType</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.getSelectorType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.getSelectorType" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of selectorType or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.numTopFeatures">
<code class="descname">numTopFeatures</code><em class="property"> = Param(parent='undefined', name='numTopFeatures', doc='Number of features that selector will select, ordered by ascending p-value. If the number of features is &lt; numTopFeatures, then this will select all features.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.numTopFeatures" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.percentile">
<code class="descname">percentile</code><em class="property"> = Param(parent='undefined', name='percentile', doc='Percentile of features that selector will select, ordered by ascending p-value.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.percentile" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelector.selectorType">
<code class="descname">selectorType</code><em class="property"> = Param(parent='undefined', name='selectorType', doc='The selector type of the ChiSqSelector. Supported options: numTopFeatures (default), percentile, fpr, fdr and fwe.')</em><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.selectorType" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setFdr">
<code class="descname">setFdr</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.setFdr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setFdr" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.fdr" title="pyspark.ml.feature.ChiSqSelector.fdr"><code class="xref py py-attr docutils literal"><span class="pre">fdr</span></code></a>.
Only applicable when selectorType = “fdr”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.featuresCol" title="pyspark.ml.feature.ChiSqSelector.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setFpr">
<code class="descname">setFpr</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.setFpr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setFpr" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.fpr" title="pyspark.ml.feature.ChiSqSelector.fpr"><code class="xref py py-attr docutils literal"><span class="pre">fpr</span></code></a>.
Only applicable when selectorType = “fpr”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setFwe">
<code class="descname">setFwe</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.setFwe"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setFwe" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.fwe" title="pyspark.ml.feature.ChiSqSelector.fwe"><code class="xref py py-attr docutils literal"><span class="pre">fwe</span></code></a>.
Only applicable when selectorType = “fwe”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.labelCol" title="pyspark.ml.feature.ChiSqSelector.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setNumTopFeatures">
<code class="descname">setNumTopFeatures</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.setNumTopFeatures"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setNumTopFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.numTopFeatures" title="pyspark.ml.feature.ChiSqSelector.numTopFeatures"><code class="xref py py-attr docutils literal"><span class="pre">numTopFeatures</span></code></a>.
Only applicable when selectorType = “numTopFeatures”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.outputCol" title="pyspark.ml.feature.ChiSqSelector.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>numTopFeatures=50</em>, <em>featuresCol=&quot;features&quot;</em>, <em>outputCol=None</em>, <em>labelCol=&quot;labels&quot;</em>, <em>selectorType=&quot;numTopFeatures&quot;</em>, <em>percentile=0.1</em>, <em>fpr=0.05</em>, <em>fdr=0.05</em>, <em>fwe=0.05</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this ChiSqSelector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setPercentile">
<code class="descname">setPercentile</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.setPercentile"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setPercentile" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.percentile" title="pyspark.ml.feature.ChiSqSelector.percentile"><code class="xref py py-attr docutils literal"><span class="pre">percentile</span></code></a>.
Only applicable when selectorType = “percentile”.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.setSelectorType">
<code class="descname">setSelectorType</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelector.setSelectorType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.setSelectorType" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector.selectorType" title="pyspark.ml.feature.ChiSqSelector.selectorType"><code class="xref py py-attr docutils literal"><span class="pre">selectorType</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelector.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelector.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.ChiSqSelectorModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">ChiSqSelectorModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ChiSqSelectorModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.ChiSqSelector" title="pyspark.ml.feature.ChiSqSelector"><code class="xref py py-class docutils literal"><span class="pre">ChiSqSelector</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.selectedFeatures">
<code class="descname">selectedFeatures</code><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.selectedFeatures" title="Permalink to this definition"></a></dt>
<dd><p>List of indices to select (filter).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ChiSqSelectorModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ChiSqSelectorModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.CountVectorizer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">CountVectorizer</code><span class="sig-paren">(</span><em>minTF=1.0</em>, <em>minDF=1.0</em>, <em>vocabSize=262144</em>, <em>binary=False</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer" title="Permalink to this definition"></a></dt>
<dd><p>Extracts a vocabulary from document collections and generates a <a class="reference internal" href="#pyspark.ml.feature.CountVectorizerModel" title="pyspark.ml.feature.CountVectorizerModel"><code class="xref py py-class docutils literal"><span class="pre">CountVectorizerModel</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">]),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;a&quot;</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;raw&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cv</span> <span class="o">=</span> <span class="n">CountVectorizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;raw&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;vectors&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">cv</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="go">+-----+---------------+-------------------------+</span>
<span class="go">|label|raw            |vectors                  |</span>
<span class="go">+-----+---------------+-------------------------+</span>
<span class="go">|0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|</span>
<span class="go">|1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|</span>
<span class="go">+-----+---------------+-------------------------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">vocabulary</span><span class="p">)</span> <span class="o">==</span> <span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="s1">&#39;b&#39;</span><span class="p">,</span> <span class="s1">&#39;c&#39;</span><span class="p">]</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">countVectorizerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/count-vectorizer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cv</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">countVectorizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedCv</span> <span class="o">=</span> <span class="n">CountVectorizer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">countVectorizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedCv</span><span class="o">.</span><span class="n">getMinDF</span><span class="p">()</span> <span class="o">==</span> <span class="n">cv</span><span class="o">.</span><span class="n">getMinDF</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedCv</span><span class="o">.</span><span class="n">getMinTF</span><span class="p">()</span> <span class="o">==</span> <span class="n">cv</span><span class="o">.</span><span class="n">getMinTF</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedCv</span><span class="o">.</span><span class="n">getVocabSize</span><span class="p">()</span> <span class="o">==</span> <span class="n">cv</span><span class="o">.</span><span class="n">getVocabSize</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/count-vectorizer-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">CountVectorizerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">vocabulary</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">vocabulary</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizer.binary">
<code class="descname">binary</code><em class="property"> = Param(parent='undefined', name='binary', doc='Binary toggle to control the output vector values. If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False')</em><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.binary" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getBinary">
<code class="descname">getBinary</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.getBinary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getBinary" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of binary or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getMinDF">
<code class="descname">getMinDF</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.getMinDF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getMinDF" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minDF or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getMinTF">
<code class="descname">getMinTF</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.getMinTF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getMinTF" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minTF or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.getVocabSize">
<code class="descname">getVocabSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.getVocabSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.getVocabSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of vocabSize or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizer.minDF">
<code class="descname">minDF</code><em class="property"> = Param(parent='undefined', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer &gt;= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0')</em><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.minDF" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizer.minTF">
<code class="descname">minTF</code><em class="property"> = Param(parent='undefined', name='minTF', doc=&quot;Filter to ignore rare words in a document. For each document, terms with frequency/count less than the given threshold are ignored. If this is an integer &gt;= 1, then this specifies a count (of times the term must appear in the document); if this is a double in [0,1), then this specifies a fraction (out of the document's token count). Note that the parameter is only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0&quot;)</em><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.minTF" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.setBinary">
<code class="descname">setBinary</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.setBinary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.setBinary" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer.binary" title="pyspark.ml.feature.CountVectorizer.binary"><code class="xref py py-attr docutils literal"><span class="pre">binary</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer.inputCol" title="pyspark.ml.feature.CountVectorizer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.setMinDF">
<code class="descname">setMinDF</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.setMinDF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.setMinDF" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer.minDF" title="pyspark.ml.feature.CountVectorizer.minDF"><code class="xref py py-attr docutils literal"><span class="pre">minDF</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.setMinTF">
<code class="descname">setMinTF</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.setMinTF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.setMinTF" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer.minTF" title="pyspark.ml.feature.CountVectorizer.minTF"><code class="xref py py-attr docutils literal"><span class="pre">minTF</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer.outputCol" title="pyspark.ml.feature.CountVectorizer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>minTF=1.0</em>, <em>minDF=1.0</em>, <em>vocabSize=1 &lt;&lt; 18</em>, <em>binary=False</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets the params for the CountVectorizer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.setVocabSize">
<code class="descname">setVocabSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizer.setVocabSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.setVocabSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer.vocabSize" title="pyspark.ml.feature.CountVectorizer.vocabSize"><code class="xref py py-attr docutils literal"><span class="pre">vocabSize</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizer.vocabSize">
<code class="descname">vocabSize</code><em class="property"> = Param(parent='undefined', name='vocabSize', doc='max size of the vocabulary. Default 1 &lt;&lt; 18.')</em><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.vocabSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.CountVectorizerModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">CountVectorizerModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#CountVectorizerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer" title="pyspark.ml.feature.CountVectorizer"><code class="xref py py-class docutils literal"><span class="pre">CountVectorizer</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizerModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.CountVectorizerModel.vocabulary">
<code class="descname">vocabulary</code><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.vocabulary" title="Permalink to this definition"></a></dt>
<dd><p>An array of terms in the vocabulary.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.CountVectorizerModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.CountVectorizerModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.DCT">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">DCT</code><span class="sig-paren">(</span><em>inverse=False</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#DCT"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.DCT" title="Permalink to this definition"></a></dt>
<dd><p>A feature transformer that takes the 1D discrete cosine transform
of a real vector. No zero padding is performed on the input vector.
It returns a real vector of the same length representing the DCT.
The return vector is scaled such that the transform matrix is
unitary (aka scaled DCT-II).</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II">More information on Wikipedia</a>.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">5.0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">,</span> <span class="mf">6.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;vec&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dct</span> <span class="o">=</span> <span class="n">DCT</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;vec&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;resultVec&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">dct</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">resultVec</span>
<span class="go">DenseVector([10.969..., -0.707..., -2.041...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df3</span> <span class="o">=</span> <span class="n">DCT</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;resultVec&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;origVec&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df3</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">origVec</span>
<span class="go">DenseVector([5.0, 8.0, 6.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dctPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/dct&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dct</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">dctPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedDtc</span> <span class="o">=</span> <span class="n">DCT</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">dctPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedDtc</span><span class="o">.</span><span class="n">getInverse</span><span class="p">()</span>
<span class="go">False</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.getInverse">
<code class="descname">getInverse</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#DCT.getInverse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.DCT.getInverse" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inverse or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.DCT.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.DCT.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.DCT.inverse">
<code class="descname">inverse</code><em class="property"> = Param(parent='undefined', name='inverse', doc='Set transformer to perform inverse DCT, default False.')</em><a class="headerlink" href="#pyspark.ml.feature.DCT.inverse" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.DCT.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.DCT.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.DCT.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.DCT.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.DCT.inputCol" title="pyspark.ml.feature.DCT.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.setInverse">
<code class="descname">setInverse</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#DCT.setInverse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.DCT.setInverse" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.DCT.inverse" title="pyspark.ml.feature.DCT.inverse"><code class="xref py py-attr docutils literal"><span class="pre">inverse</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.DCT.outputCol" title="pyspark.ml.feature.DCT.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inverse=False</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#DCT.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.DCT.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this DCT.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.DCT.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.DCT.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.ElementwiseProduct">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">ElementwiseProduct</code><span class="sig-paren">(</span><em>scalingVec=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ElementwiseProduct"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct" title="Permalink to this definition"></a></dt>
<dd><p>Outputs the Hadamard product (i.e., the element-wise product) of each input vector
with a provided “weight” vector. In other words, it scales each column of the dataset
by a scalar multiplier.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;values&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ep</span> <span class="o">=</span> <span class="n">ElementwiseProduct</span><span class="p">(</span><span class="n">scalingVec</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]),</span>
<span class="gp">... </span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;values&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;eprod&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ep</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">eprod</span>
<span class="go">DenseVector([2.0, 2.0, 9.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ep</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">scalingVec</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]))</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">eprod</span>
<span class="go">DenseVector([4.0, 3.0, 15.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">elementwiseProductPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/elementwise-product&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ep</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">elementwiseProductPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedEp</span> <span class="o">=</span> <span class="n">ElementwiseProduct</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">elementwiseProductPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedEp</span><span class="o">.</span><span class="n">getScalingVec</span><span class="p">()</span> <span class="o">==</span> <span class="n">ep</span><span class="o">.</span><span class="n">getScalingVec</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.getScalingVec">
<code class="descname">getScalingVec</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ElementwiseProduct.getScalingVec"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.getScalingVec" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of scalingVec or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ElementwiseProduct.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ElementwiseProduct.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ElementwiseProduct.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ElementwiseProduct.scalingVec">
<code class="descname">scalingVec</code><em class="property"> = Param(parent='undefined', name='scalingVec', doc='Vector for hadamard product.')</em><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.scalingVec" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ElementwiseProduct.inputCol" title="pyspark.ml.feature.ElementwiseProduct.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ElementwiseProduct.outputCol" title="pyspark.ml.feature.ElementwiseProduct.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>scalingVec=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ElementwiseProduct.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this ElementwiseProduct.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.setScalingVec">
<code class="descname">setScalingVec</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ElementwiseProduct.setScalingVec"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.setScalingVec" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.ElementwiseProduct.scalingVec" title="pyspark.ml.feature.ElementwiseProduct.scalingVec"><code class="xref py py-attr docutils literal"><span class="pre">scalingVec</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ElementwiseProduct.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ElementwiseProduct.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.HashingTF">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">HashingTF</code><span class="sig-paren">(</span><em>numFeatures=262144</em>, <em>binary=False</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#HashingTF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.HashingTF" title="Permalink to this definition"></a></dt>
<dd><p>Maps a sequence of terms to their term frequencies using the hashing trick.
Currently we use Austin Appleby’s MurmurHash 3 algorithm (MurmurHash3_x86_32)
to calculate the hash code value for the term object.
Since a simple modulo is used to transform the hash value into a column index,
it is advisable to use a power of two as the numFeatures parameter;
otherwise the features will not be mapped evenly to the columns.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([([</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">],)],</span> <span class="p">[</span><span class="s2">&quot;words&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;words&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">features</span>
<span class="go">SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">hashingTF</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;freqs&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">freqs</span>
<span class="go">SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="n">hashingTF</span><span class="o">.</span><span class="n">numFeatures</span><span class="p">:</span> <span class="mi">5</span><span class="p">,</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;vector&quot;</span><span class="p">}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span>
<span class="go">SparseVector(5, {0: 1.0, 1: 1.0, 2: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">hashingTFPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/hashing-tf&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">hashingTF</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">hashingTFPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedHashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">hashingTFPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedHashingTF</span><span class="o">.</span><span class="n">getNumFeatures</span><span class="p">()</span> <span class="o">==</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">getNumFeatures</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.feature.HashingTF.binary">
<code class="descname">binary</code><em class="property"> = Param(parent='undefined', name='binary', doc='If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.')</em><a class="headerlink" href="#pyspark.ml.feature.HashingTF.binary" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.getBinary">
<code class="descname">getBinary</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#HashingTF.getBinary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.HashingTF.getBinary" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of binary or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.getNumFeatures">
<code class="descname">getNumFeatures</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.getNumFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numFeatures or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.HashingTF.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.HashingTF.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.HashingTF.numFeatures">
<code class="descname">numFeatures</code><em class="property"> = Param(parent='undefined', name='numFeatures', doc='number of features.')</em><a class="headerlink" href="#pyspark.ml.feature.HashingTF.numFeatures" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.HashingTF.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.HashingTF.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.HashingTF.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.HashingTF.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.setBinary">
<code class="descname">setBinary</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#HashingTF.setBinary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.HashingTF.setBinary" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.HashingTF.binary" title="pyspark.ml.feature.HashingTF.binary"><code class="xref py py-attr docutils literal"><span class="pre">binary</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.HashingTF.inputCol" title="pyspark.ml.feature.HashingTF.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.setNumFeatures">
<code class="descname">setNumFeatures</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.setNumFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.HashingTF.numFeatures" title="pyspark.ml.feature.HashingTF.numFeatures"><code class="xref py py-attr docutils literal"><span class="pre">numFeatures</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.HashingTF.outputCol" title="pyspark.ml.feature.HashingTF.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>numFeatures=1 &lt;&lt; 18</em>, <em>binary=False</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#HashingTF.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.HashingTF.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this HashingTF.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.HashingTF.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.HashingTF.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.IDF">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">IDF</code><span class="sig-paren">(</span><em>minDocFreq=0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IDF"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IDF" title="Permalink to this definition"></a></dt>
<dd><p>Compute the Inverse Document Frequency (IDF) given a collection of documents.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">DenseVector</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;tf&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;tf&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;idf&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">idf</span>
<span class="go">DenseVector([0.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">idf</span>
<span class="go">DenseVector([0.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">idf</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;freqs&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">freqs</span>
<span class="go">DenseVector([0.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="n">idf</span><span class="o">.</span><span class="n">minDocFreq</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span> <span class="n">idf</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;vector&quot;</span><span class="p">}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span>
<span class="go">DenseVector([0.2877, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">idfPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/idf&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">idf</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">idfPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIdf</span> <span class="o">=</span> <span class="n">IDF</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">idfPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIdf</span><span class="o">.</span><span class="n">getMinDocFreq</span><span class="p">()</span> <span class="o">==</span> <span class="n">idf</span><span class="o">.</span><span class="n">getMinDocFreq</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/idf-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">IDFModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">idf</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">idf</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.getMinDocFreq">
<code class="descname">getMinDocFreq</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IDF.getMinDocFreq"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IDF.getMinDocFreq" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minDocFreq or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IDF.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.IDF.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IDF.minDocFreq">
<code class="descname">minDocFreq</code><em class="property"> = Param(parent='undefined', name='minDocFreq', doc='minimum number of documents in which a term should appear for filtering')</em><a class="headerlink" href="#pyspark.ml.feature.IDF.minDocFreq" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IDF.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.IDF.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IDF.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.IDF.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.IDF.inputCol" title="pyspark.ml.feature.IDF.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.setMinDocFreq">
<code class="descname">setMinDocFreq</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IDF.setMinDocFreq"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IDF.setMinDocFreq" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.IDF.minDocFreq" title="pyspark.ml.feature.IDF.minDocFreq"><code class="xref py py-attr docutils literal"><span class="pre">minDocFreq</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.IDF.outputCol" title="pyspark.ml.feature.IDF.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>minDocFreq=0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IDF.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IDF.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this IDF.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDF.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDF.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.IDFModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">IDFModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IDFModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IDFModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.IDF" title="pyspark.ml.feature.IDF"><code class="xref py py-class docutils literal"><span class="pre">IDF</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then make a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IDFModel.idf">
<code class="descname">idf</code><a class="headerlink" href="#pyspark.ml.feature.IDFModel.idf" title="Permalink to this definition"></a></dt>
<dd><p>Returns the IDF vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IDFModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.IDFModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IDFModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IDFModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.Imputer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">Imputer</code><span class="sig-paren">(</span><em>strategy='mean'</em>, <em>missingValue=nan</em>, <em>inputCols=None</em>, <em>outputCols=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Imputation estimator for completing missing values, either using the mean or the median
of the columns in which the missing values are located. The input columns should be of
DoubleType or FloatType. Currently Imputer does not support categorical features and
possibly creates incorrect values for a categorical feature.</p>
<p>Note that the mean/median value is computed after filtering out missing values.
All Null values in the input columns are treated as missing, and so are also imputed. For
computing median, <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame.approxQuantile" title="pyspark.sql.DataFrame.approxQuantile"><code class="xref py py-meth docutils literal"><span class="pre">pyspark.sql.DataFrame.approxQuantile()</span></code></a> is used with a
relative error of <cite>0.001</cite>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="mf">1.0</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">)),</span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">)),</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),</span> <span class="mf">3.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">4.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">5.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">)],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">imputer</span> <span class="o">=</span> <span class="n">Imputer</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">],</span> <span class="n">outputCols</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;out_a&quot;</span><span class="p">,</span> <span class="s2">&quot;out_b&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">imputer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">surrogateDF</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+</span>
<span class="go">|  a|  b|</span>
<span class="go">+---+---+</span>
<span class="go">|3.0|4.0|</span>
<span class="go">+---+---+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+-----+-----+</span>
<span class="go">|  a|  b|out_a|out_b|</span>
<span class="go">+---+---+-----+-----+</span>
<span class="go">|1.0|NaN|  1.0|  4.0|</span>
<span class="go">|2.0|NaN|  2.0|  4.0|</span>
<span class="go">|NaN|3.0|  3.0|  3.0|</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">imputer</span><span class="o">.</span><span class="n">setStrategy</span><span class="p">(</span><span class="s2">&quot;median&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">setMissingValue</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+-----+-----+</span>
<span class="go">|  a|  b|out_a|out_b|</span>
<span class="go">+---+---+-----+-----+</span>
<span class="go">|1.0|NaN|  4.0|  NaN|</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">imputerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/imputer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">imputer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">imputerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedImputer</span> <span class="o">=</span> <span class="n">Imputer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">imputerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedImputer</span><span class="o">.</span><span class="n">getStrategy</span><span class="p">()</span> <span class="o">==</span> <span class="n">imputer</span><span class="o">.</span><span class="n">getStrategy</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedImputer</span><span class="o">.</span><span class="n">getMissingValue</span><span class="p">()</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/imputer-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">ImputerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">out_a</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">out_a</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.getInputCols">
<code class="descname">getInputCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.getInputCols" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCols or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.getMissingValue">
<code class="descname">getMissingValue</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer.getMissingValue"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer.getMissingValue" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.Imputer.missingValue" title="pyspark.ml.feature.Imputer.missingValue"><code class="xref py py-attr docutils literal"><span class="pre">missingValue</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.getOutputCols">
<code class="descname">getOutputCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer.getOutputCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer.getOutputCols" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.Imputer.outputCols" title="pyspark.ml.feature.Imputer.outputCols"><code class="xref py py-attr docutils literal"><span class="pre">outputCols</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.getStrategy">
<code class="descname">getStrategy</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer.getStrategy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer.getStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.Imputer.strategy" title="pyspark.ml.feature.Imputer.strategy"><code class="xref py py-attr docutils literal"><span class="pre">strategy</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Imputer.inputCols">
<code class="descname">inputCols</code><em class="property"> = Param(parent='undefined', name='inputCols', doc='input column names.')</em><a class="headerlink" href="#pyspark.ml.feature.Imputer.inputCols" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Imputer.missingValue">
<code class="descname">missingValue</code><em class="property"> = Param(parent='undefined', name='missingValue', doc='The placeholder for the missing values. All occurrences of missingValue will be imputed.')</em><a class="headerlink" href="#pyspark.ml.feature.Imputer.missingValue" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Imputer.outputCols">
<code class="descname">outputCols</code><em class="property"> = Param(parent='undefined', name='outputCols', doc='output column names.')</em><a class="headerlink" href="#pyspark.ml.feature.Imputer.outputCols" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Imputer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.Imputer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.setInputCols">
<code class="descname">setInputCols</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.setInputCols" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Imputer.inputCols" title="pyspark.ml.feature.Imputer.inputCols"><code class="xref py py-attr docutils literal"><span class="pre">inputCols</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.setMissingValue">
<code class="descname">setMissingValue</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer.setMissingValue"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer.setMissingValue" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Imputer.missingValue" title="pyspark.ml.feature.Imputer.missingValue"><code class="xref py py-attr docutils literal"><span class="pre">missingValue</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.setOutputCols">
<code class="descname">setOutputCols</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer.setOutputCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer.setOutputCols" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Imputer.outputCols" title="pyspark.ml.feature.Imputer.outputCols"><code class="xref py py-attr docutils literal"><span class="pre">outputCols</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>strategy=&quot;mean&quot;</em>, <em>missingValue=float(&quot;nan&quot;)</em>, <em>inputCols=None</em>, <em>outputCols=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this Imputer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.setStrategy">
<code class="descname">setStrategy</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Imputer.setStrategy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Imputer.setStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Imputer.strategy" title="pyspark.ml.feature.Imputer.strategy"><code class="xref py py-attr docutils literal"><span class="pre">strategy</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Imputer.strategy">
<code class="descname">strategy</code><em class="property"> = Param(parent='undefined', name='strategy', doc='strategy for imputation. If mean, then replace missing values using the mean value of the feature. If median, then replace missing values using the median value of the feature.')</em><a class="headerlink" href="#pyspark.ml.feature.Imputer.strategy" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Imputer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Imputer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.ImputerModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">ImputerModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#ImputerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.ImputerModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.Imputer" title="pyspark.ml.feature.Imputer"><code class="xref py py-class docutils literal"><span class="pre">Imputer</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ImputerModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.ImputerModel.surrogateDF">
<code class="descname">surrogateDF</code><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.surrogateDF" title="Permalink to this definition"></a></dt>
<dd><p>Returns a DataFrame containing inputCols and their corresponding surrogates,
which are used to replace the missing values in the input DataFrame.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.ImputerModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.ImputerModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.IndexToString">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">IndexToString</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em>, <em>labels=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IndexToString"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IndexToString" title="Permalink to this definition"></a></dt>
<dd><p>A <code class="xref py py-class docutils literal"><span class="pre">Transformer</span></code> that maps a column of indices back to a new column of
corresponding string values.
The index-string mapping is either from the ML attributes of the input column,
or from user-supplied labels (which take precedence over ML attributes).
See <a class="reference internal" href="#pyspark.ml.feature.StringIndexer" title="pyspark.ml.feature.StringIndexer"><code class="xref py py-class docutils literal"><span class="pre">StringIndexer</span></code></a> for converting strings into indices.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.getLabels">
<code class="descname">getLabels</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IndexToString.getLabels"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IndexToString.getLabels" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.IndexToString.labels" title="pyspark.ml.feature.IndexToString.labels"><code class="xref py py-attr docutils literal"><span class="pre">labels</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IndexToString.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.IndexToString.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IndexToString.labels">
<code class="descname">labels</code><em class="property"> = Param(parent='undefined', name='labels', doc='Optional array of labels specifying index-string mapping. If not provided or if empty, then metadata from inputCol is used instead.')</em><a class="headerlink" href="#pyspark.ml.feature.IndexToString.labels" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IndexToString.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.IndexToString.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.IndexToString.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.IndexToString.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.IndexToString.inputCol" title="pyspark.ml.feature.IndexToString.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.setLabels">
<code class="descname">setLabels</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IndexToString.setLabels"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IndexToString.setLabels" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.IndexToString.labels" title="pyspark.ml.feature.IndexToString.labels"><code class="xref py py-attr docutils literal"><span class="pre">labels</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.IndexToString.outputCol" title="pyspark.ml.feature.IndexToString.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>labels=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#IndexToString.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.IndexToString.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this IndexToString.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.IndexToString.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.IndexToString.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.MaxAbsScaler">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">MaxAbsScaler</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MaxAbsScaler"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler" title="Permalink to this definition"></a></dt>
<dd><p>Rescale each feature individually to range [-1, 1] by dividing by the largest
absolute value in each feature. It does not shift/center the data, and thus does not destroy
any sparsity.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">maScaler</span> <span class="o">=</span> <span class="n">MaxAbsScaler</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;scaled&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">maScaler</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+-----+------+</span>
<span class="go">|    a|scaled|</span>
<span class="go">+-----+------+</span>
<span class="go">|[1.0]| [0.5]|</span>
<span class="go">|[2.0]| [1.0]|</span>
<span class="go">+-----+------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">scalerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/max-abs-scaler&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">maScaler</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">scalerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedMAScaler</span> <span class="o">=</span> <span class="n">MaxAbsScaler</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">scalerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedMAScaler</span><span class="o">.</span><span class="n">getInputCol</span><span class="p">()</span> <span class="o">==</span> <span class="n">maScaler</span><span class="o">.</span><span class="n">getInputCol</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedMAScaler</span><span class="o">.</span><span class="n">getOutputCol</span><span class="p">()</span> <span class="o">==</span> <span class="n">maScaler</span><span class="o">.</span><span class="n">getOutputCol</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/max-abs-scaler-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">MaxAbsScalerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">maxAbs</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">maxAbs</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MaxAbsScaler.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MaxAbsScaler.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MaxAbsScaler.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MaxAbsScaler.inputCol" title="pyspark.ml.feature.MaxAbsScaler.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MaxAbsScaler.outputCol" title="pyspark.ml.feature.MaxAbsScaler.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MaxAbsScaler.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this MaxAbsScaler.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScaler.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScaler.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.MaxAbsScalerModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">MaxAbsScalerModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MaxAbsScalerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.MaxAbsScaler" title="pyspark.ml.feature.MaxAbsScaler"><code class="xref py py-class docutils literal"><span class="pre">MaxAbsScaler</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.maxAbs">
<code class="descname">maxAbs</code><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.maxAbs" title="Permalink to this definition"></a></dt>
<dd><p>Max Abs vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MaxAbsScalerModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MaxAbsScalerModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.MinHashLSH">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">MinHashLSH</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em>, <em>seed=None</em>, <em>numHashTables=1</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinHashLSH"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>LSH class for Jaccard distance.
The input can be dense or sparse vectors, but it is more efficient if it is sparse.
For example, <cite>Vectors.sparse(10, [(2, 1.0), (3, 1.0), (5, 1.0)])</cite> means there are 10 elements
in the space. This set contains elements 2, 3, and 5. Also, any input vector must have at
least 1 non-zero index, and all non-zero values are treated as binary “1” values.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://en.wikipedia.org/wiki/MinHash">Wikipedia on MinHash</a></p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="k">import</span> <span class="n">col</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mh</span> <span class="o">=</span> <span class="n">MinHashLSH</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;hashes&quot;</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">12345</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">mh</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([-1638925...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data2</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">3</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data2</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">key</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">approxNearestNeighbors</span><span class="p">(</span><span class="n">df2</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([-163892...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">approxSimilarityJoin</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">df2</span><span class="p">,</span> <span class="mf">0.6</span><span class="p">,</span> <span class="n">distCol</span><span class="o">=</span><span class="s2">&quot;JaccardDistance&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="gp">... </span> <span class="n">col</span><span class="p">(</span><span class="s2">&quot;datasetA.id&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;idA&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">col</span><span class="p">(</span><span class="s2">&quot;datasetB.id&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;idB&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">col</span><span class="p">(</span><span class="s2">&quot;JaccardDistance&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+---------------+</span>
<span class="go">|idA|idB|JaccardDistance|</span>
<span class="go">+---+---+---------------+</span>
<span class="go">| 1| 4| 0.5|</span>
<span class="go">| 0| 5| 0.5|</span>
<span class="go">+---+---+---------------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mhPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/mh&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mh</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">mhPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mh2</span> <span class="o">=</span> <span class="n">MinHashLSH</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">mhPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mh2</span><span class="o">.</span><span class="n">getOutputCol</span><span class="p">()</span> <span class="o">==</span> <span class="n">mh</span><span class="o">.</span><span class="n">getOutputCol</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/mh-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">MinHashLSHModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">hashes</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">hashes</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.getNumHashTables">
<code class="descname">getNumHashTables</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.getNumHashTables" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numHashTables or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinHashLSH.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinHashLSH.numHashTables">
<code class="descname">numHashTables</code><em class="property"> = Param(parent='undefined', name='numHashTables', doc='number of hash tables, where increasing number of hash tables lowers the false negative rate, and decreasing it improves the running performance.')</em><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.numHashTables" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinHashLSH.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinHashLSH.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinHashLSH.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinHashLSH.inputCol" title="pyspark.ml.feature.MinHashLSH.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.setNumHashTables">
<code class="descname">setNumHashTables</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.setNumHashTables" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinHashLSH.numHashTables" title="pyspark.ml.feature.MinHashLSH.numHashTables"><code class="xref py py-attr docutils literal"><span class="pre">numHashTables</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinHashLSH.outputCol" title="pyspark.ml.feature.MinHashLSH.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>seed=None</em>, <em>numHashTables=1</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinHashLSH.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this MinHashLSH.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinHashLSH.seed" title="pyspark.ml.feature.MinHashLSH.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSH.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSH.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.MinHashLSHModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">MinHashLSHModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinHashLSHModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model produced by <a class="reference internal" href="#pyspark.ml.feature.MinHashLSH" title="pyspark.ml.feature.MinHashLSH"><code class="xref py py-class docutils literal"><span class="pre">MinHashLSH</span></code></a>, where multiple hash functions are stored. Each
hash function is picked from the following family of hash functions, where <span class="math">\(a_i\)</span> and
<span class="math">\(b_i\)</span> are randomly chosen integers less than prime:
<span class="math">\(h_i(x) = ((x \cdot a_i + b_i) \mod prime)\)</span>. This hash family is approximately min-wise
independent according to the reference.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last">Tom Bohman, Colin Cooper, and Alan Frieze. “Min-wise independent linear permutations.” Electronic Journal of Combinatorics 7 (2000): R26.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.approxNearestNeighbors">
<code class="descname">approxNearestNeighbors</code><span class="sig-paren">(</span><em>dataset</em>, <em>key</em>, <em>numNearestNeighbors</em>, <em>distCol='distCol'</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.approxNearestNeighbors" title="Permalink to this definition"></a></dt>
<dd><p>Given a large dataset and an item, approximately find at most numNearestNeighbors items which have the
closest distance to the item. If the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> is missing, the method will
transform the data; if the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> exists, it will use that. This allows
caching of the transformed data when necessary.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This method is experimental and will likely change behavior in the next release.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – The dataset to search for nearest neighbors of the key.</li>
<li><strong>key</strong> – Feature vector representing the item to search for.</li>
<li><strong>numNearestNeighbors</strong> – The maximum number of nearest neighbors.</li>
<li><strong>distCol</strong> – Output column for storing the distance between each result row and the key.
Use “distCol” as default value if it’s not specified.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A dataset containing at most numNearestNeighbors items closest to the key. A column “distCol” is
added to show the distance between each row and the key.</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.approxSimilarityJoin">
<code class="descname">approxSimilarityJoin</code><span class="sig-paren">(</span><em>datasetA</em>, <em>datasetB</em>, <em>threshold</em>, <em>distCol='distCol'</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.approxSimilarityJoin" title="Permalink to this definition"></a></dt>
<dd><p>Join two datasets to approximately find all pairs of rows whose distance is smaller than
the threshold. If the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> is missing, the method will transform the data;
if the <code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code> exists, it will use that. This allows caching of the
transformed data when necessary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>datasetA</strong> – One of the datasets to join.</li>
<li><strong>datasetB</strong> – Another dataset to join.</li>
<li><strong>threshold</strong> – The threshold for the distance of row pairs.</li>
<li><strong>distCol</strong> – Output column for storing the distance between each pair of rows. Use
“distCol” as default value if it’s not specified.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A joined dataset containing pairs of rows. The original rows are in columns
“datasetA” and “datasetB”, and a column “distCol” is added to show the distance
between each pair.</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinHashLSHModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinHashLSHModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinHashLSHModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.MinMaxScaler">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">MinMaxScaler</code><span class="sig-paren">(</span><em>min=0.0</em>, <em>max=1.0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinMaxScaler"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler" title="Permalink to this definition"></a></dt>
<dd><p>Rescale each feature individually to a common range [min, max] linearly using column summary
statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
feature E is calculated as,</p>
<p>Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min</p>
<p>For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Since zero values will probably be transformed to non-zero values, output of the
transformer will be DenseVector even for sparse input.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mmScaler</span> <span class="o">=</span> <span class="n">MinMaxScaler</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;scaled&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">mmScaler</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">originalMin</span>
<span class="go">DenseVector([0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">originalMax</span>
<span class="go">DenseVector([2.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+-----+------+</span>
<span class="go">|    a|scaled|</span>
<span class="go">+-----+------+</span>
<span class="go">|[0.0]| [0.0]|</span>
<span class="go">|[2.0]| [1.0]|</span>
<span class="go">+-----+------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">minMaxScalerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/min-max-scaler&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mmScaler</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">minMaxScalerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedMMScaler</span> <span class="o">=</span> <span class="n">MinMaxScaler</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">minMaxScalerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedMMScaler</span><span class="o">.</span><span class="n">getMin</span><span class="p">()</span> <span class="o">==</span> <span class="n">mmScaler</span><span class="o">.</span><span class="n">getMin</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedMMScaler</span><span class="o">.</span><span class="n">getMax</span><span class="p">()</span> <span class="o">==</span> <span class="n">mmScaler</span><span class="o">.</span><span class="n">getMax</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/min-max-scaler-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">MinMaxScalerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">originalMin</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">originalMin</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">originalMax</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">originalMax</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.getMax">
<code class="descname">getMax</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinMaxScaler.getMax"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.getMax" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of max or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.getMin">
<code class="descname">getMin</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinMaxScaler.getMin"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.getMin" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of min or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScaler.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScaler.max">
<code class="descname">max</code><em class="property"> = Param(parent='undefined', name='max', doc='Upper bound of the output feature range')</em><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.max" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScaler.min">
<code class="descname">min</code><em class="property"> = Param(parent='undefined', name='min', doc='Lower bound of the output feature range')</em><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.min" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScaler.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScaler.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinMaxScaler.inputCol" title="pyspark.ml.feature.MinMaxScaler.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.setMax">
<code class="descname">setMax</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinMaxScaler.setMax"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.setMax" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinMaxScaler.max" title="pyspark.ml.feature.MinMaxScaler.max"><code class="xref py py-attr docutils literal"><span class="pre">max</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.setMin">
<code class="descname">setMin</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinMaxScaler.setMin"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.setMin" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinMaxScaler.min" title="pyspark.ml.feature.MinMaxScaler.min"><code class="xref py py-attr docutils literal"><span class="pre">min</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.MinMaxScaler.outputCol" title="pyspark.ml.feature.MinMaxScaler.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>min=0.0</em>, <em>max=1.0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinMaxScaler.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this MinMaxScaler.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScaler.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScaler.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.MinMaxScalerModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">MinMaxScalerModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#MinMaxScalerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.MinMaxScaler" title="pyspark.ml.feature.MinMaxScaler"><code class="xref py py-class docutils literal"><span class="pre">MinMaxScaler</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScalerModel.originalMax">
<code class="descname">originalMax</code><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.originalMax" title="Permalink to this definition"></a></dt>
<dd><p>Max value for each original column during fitting.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScalerModel.originalMin">
<code class="descname">originalMin</code><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.originalMin" title="Permalink to this definition"></a></dt>
<dd><p>Min value for each original column during fitting.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.MinMaxScalerModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.MinMaxScalerModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.MinMaxScalerModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.NGram">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">NGram</code><span class="sig-paren">(</span><em>n=2</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#NGram"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.NGram" title="Permalink to this definition"></a></dt>
<dd><p>A feature transformer that converts the input array of strings into an array of n-grams. Null
values in the input array are ignored.
It returns an array of n-grams where each n-gram is represented by a space-separated string of
words.
When the input is empty, an empty array is returned.
When the input array length is less than n (number of elements per n-gram), no n-grams are
returned.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">inputTokens</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;d&quot;</span><span class="p">,</span> <span class="s2">&quot;e&quot;</span><span class="p">])])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngram</span> <span class="o">=</span> <span class="n">NGram</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;inputTokens&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;nGrams&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b&#39;, &#39;b c&#39;, &#39;c d&#39;, &#39;d e&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Change n-gram length</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngram</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Temporarily modify output column.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="p">{</span><span class="n">ngram</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;output&quot;</span><span class="p">})</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], output=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Must use keyword arguments to specify params.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngram</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="s2">&quot;text&quot;</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">TypeError</span>: <span class="n">Method setParams forces keyword arguments.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngramPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/ngram&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ngram</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">ngramPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedNGram</span> <span class="o">=</span> <span class="n">NGram</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">ngramPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedNGram</span><span class="o">.</span><span class="n">getN</span><span class="p">()</span> <span class="o">==</span> <span class="n">ngram</span><span class="o">.</span><span class="n">getN</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.getN">
<code class="descname">getN</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#NGram.getN"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.NGram.getN" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of n or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.NGram.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.NGram.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.NGram.n">
<code class="descname">n</code><em class="property"> = Param(parent='undefined', name='n', doc='number of elements per n-gram (&gt;=1)')</em><a class="headerlink" href="#pyspark.ml.feature.NGram.n" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.NGram.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.NGram.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.NGram.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.NGram.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.NGram.inputCol" title="pyspark.ml.feature.NGram.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.setN">
<code class="descname">setN</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#NGram.setN"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.NGram.setN" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.NGram.n" title="pyspark.ml.feature.NGram.n"><code class="xref py py-attr docutils literal"><span class="pre">n</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.NGram.outputCol" title="pyspark.ml.feature.NGram.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>n=2</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#NGram.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.NGram.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this NGram.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.NGram.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.NGram.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.Normalizer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">Normalizer</code><span class="sig-paren">(</span><em>p=2.0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Normalizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Normalizer" title="Permalink to this definition"></a></dt>
<dd><p>Normalize a vector to have unit norm using the given p-norm.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svec</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">4.0</span><span class="p">,</span> <span class="mi">3</span><span class="p">:</span> <span class="mf">3.0</span><span class="p">})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">4.0</span><span class="p">]),</span> <span class="n">svec</span><span class="p">)],</span> <span class="p">[</span><span class="s2">&quot;dense&quot;</span><span class="p">,</span> <span class="s2">&quot;sparse&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">normalizer</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;dense&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">features</span>
<span class="go">DenseVector([0.6, -0.8])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">normalizer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;sparse&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;freqs&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">freqs</span>
<span class="go">SparseVector(4, {1: 0.8, 3: 0.6})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="n">normalizer</span><span class="o">.</span><span class="n">p</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">,</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">inputCol</span><span class="p">:</span> <span class="s2">&quot;dense&quot;</span><span class="p">,</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;vector&quot;</span><span class="p">}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span>
<span class="go">DenseVector([0.4286, -0.5714])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">normalizerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/normalizer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">normalizer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">normalizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedNormalizer</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">normalizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedNormalizer</span><span class="o">.</span><span class="n">getP</span><span class="p">()</span> <span class="o">==</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">getP</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.getP">
<code class="descname">getP</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Normalizer.getP"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Normalizer.getP" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of p or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Normalizer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Normalizer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Normalizer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Normalizer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Normalizer.p">
<code class="descname">p</code><em class="property"> = Param(parent='undefined', name='p', doc='the p norm value.')</em><a class="headerlink" href="#pyspark.ml.feature.Normalizer.p" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Normalizer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.Normalizer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Normalizer.inputCol" title="pyspark.ml.feature.Normalizer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Normalizer.outputCol" title="pyspark.ml.feature.Normalizer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.setP">
<code class="descname">setP</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Normalizer.setP"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Normalizer.setP" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Normalizer.p" title="pyspark.ml.feature.Normalizer.p"><code class="xref py py-attr docutils literal"><span class="pre">p</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>p=2.0</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Normalizer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Normalizer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this Normalizer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Normalizer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Normalizer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.OneHotEncoder">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">OneHotEncoder</code><span class="sig-paren">(</span><em>dropLast=True</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#OneHotEncoder"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder" title="Permalink to this definition"></a></dt>
<dd><p>A one-hot encoder that maps a column of category indices to a
column of binary vectors, with at most a single one-value per row
that indicates the input category index.
For example, with 5 categories, an input value of 2.0 would map to
an output vector of <cite>[0.0, 0.0, 1.0, 0.0]</cite>.
The last category is not included by default (configurable via
<a class="reference internal" href="#pyspark.ml.feature.OneHotEncoder.dropLast" title="pyspark.ml.feature.OneHotEncoder.dropLast"><code class="xref py py-attr docutils literal"><span class="pre">dropLast</span></code></a>) because including it would make the vector entries sum up to
one, and hence be linearly dependent.
So an input value of 4.0 maps to <cite>[0.0, 0.0, 0.0, 0.0]</cite>.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This is different from scikit-learn’s OneHotEncoder,
which keeps all categories. The output vectors are sparse.</p>
</div>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.feature.StringIndexer" title="pyspark.ml.feature.StringIndexer"><code class="xref py py-class docutils literal"><span class="pre">StringIndexer</span></code></a> for converting categorical values into
category indices</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">td</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">encoder</span> <span class="o">=</span> <span class="n">OneHotEncoder</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">encoder</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">td</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">features</span>
<span class="go">SparseVector(2, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">encoder</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;freqs&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">td</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">freqs</span>
<span class="go">SparseVector(2, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="n">encoder</span><span class="o">.</span><span class="n">dropLast</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span> <span class="n">encoder</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;test&quot;</span><span class="p">}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">encoder</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">td</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">test</span>
<span class="go">SparseVector(3, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">onehotEncoderPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/onehot-encoder&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">encoder</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">onehotEncoderPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedEncoder</span> <span class="o">=</span> <span class="n">OneHotEncoder</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">onehotEncoderPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedEncoder</span><span class="o">.</span><span class="n">getDropLast</span><span class="p">()</span> <span class="o">==</span> <span class="n">encoder</span><span class="o">.</span><span class="n">getDropLast</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.OneHotEncoder.dropLast">
<code class="descname">dropLast</code><em class="property"> = Param(parent='undefined', name='dropLast', doc='whether to drop the last category')</em><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.dropLast" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.getDropLast">
<code class="descname">getDropLast</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#OneHotEncoder.getDropLast"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.getDropLast" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of dropLast or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.OneHotEncoder.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.OneHotEncoder.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.OneHotEncoder.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.setDropLast">
<code class="descname">setDropLast</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#OneHotEncoder.setDropLast"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.setDropLast" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.OneHotEncoder.dropLast" title="pyspark.ml.feature.OneHotEncoder.dropLast"><code class="xref py py-attr docutils literal"><span class="pre">dropLast</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.OneHotEncoder.inputCol" title="pyspark.ml.feature.OneHotEncoder.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.OneHotEncoder.outputCol" title="pyspark.ml.feature.OneHotEncoder.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>dropLast=True</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#OneHotEncoder.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this OneHotEncoder.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.OneHotEncoder.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.OneHotEncoder.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.PCA">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">PCA</code><span class="sig-paren">(</span><em>k=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PCA"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PCA" title="Permalink to this definition"></a></dt>
<dd><p>PCA trains a model to project vectors to a lower dimensional space of the
top <a class="reference internal" href="#pyspark.ml.feature.PCA.k" title="pyspark.ml.feature.PCA.k"><code class="xref py py-attr docutils literal"><span class="pre">k</span></code></a> principal components.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">)]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">4.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">6.0</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pca</span> <span class="o">=</span> <span class="n">PCA</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;pca_features&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">pca_features</span>
<span class="go">DenseVector([1.648..., -4.013...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">explainedVariance</span>
<span class="go">DenseVector([0.794..., 0.205...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pcaPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/pca&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pca</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">pcaPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedPca</span> <span class="o">=</span> <span class="n">PCA</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">pcaPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedPca</span><span class="o">.</span><span class="n">getK</span><span class="p">()</span> <span class="o">==</span> <span class="n">pca</span><span class="o">.</span><span class="n">getK</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/pca-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">PCAModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">pc</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">pc</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">explainedVariance</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">explainedVariance</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.getK">
<code class="descname">getK</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PCA.getK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PCA.getK" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of k or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PCA.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.PCA.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PCA.k">
<code class="descname">k</code><em class="property"> = Param(parent='undefined', name='k', doc='the number of principal components')</em><a class="headerlink" href="#pyspark.ml.feature.PCA.k" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PCA.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.PCA.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PCA.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.PCA.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.PCA.inputCol" title="pyspark.ml.feature.PCA.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.setK">
<code class="descname">setK</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PCA.setK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PCA.setK" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.PCA.k" title="pyspark.ml.feature.PCA.k"><code class="xref py py-attr docutils literal"><span class="pre">k</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.PCA.outputCol" title="pyspark.ml.feature.PCA.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>k=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PCA.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PCA.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this PCA.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCA.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCA.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.PCAModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">PCAModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PCAModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PCAModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.PCA" title="pyspark.ml.feature.PCA"><code class="xref py py-class docutils literal"><span class="pre">PCA</span></code></a>. Transforms vectors to a lower dimensional space.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PCAModel.explainedVariance">
<code class="descname">explainedVariance</code><a class="headerlink" href="#pyspark.ml.feature.PCAModel.explainedVariance" title="Permalink to this definition"></a></dt>
<dd><p>Returns a vector of proportions of variance
explained by each principal component.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PCAModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.PCAModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PCAModel.pc">
<code class="descname">pc</code><a class="headerlink" href="#pyspark.ml.feature.PCAModel.pc" title="Permalink to this definition"></a></dt>
<dd><p>Returns a principal components Matrix.
Each column is one principal component.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PCAModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PCAModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.PolynomialExpansion">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">PolynomialExpansion</code><span class="sig-paren">(</span><em>degree=2</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PolynomialExpansion"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion" title="Permalink to this definition"></a></dt>
<dd><p>Perform feature expansion in a polynomial space. As stated in the <a class="reference external" href="https://en.wikipedia.org/wiki/Polynomial_expansion">Wikipedia article on polynomial expansion</a>, “In mathematics, an
expansion of a product of sums expresses it as a sum of products by using the fact that
multiplication distributes over addition”. Take a 2-variable feature vector as an example:
<cite>(x, y)</cite>. If we expand it with degree 2, we get <cite>(x, x * x, y, x * y, y * y)</cite>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;dense&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">px</span> <span class="o">=</span> <span class="n">PolynomialExpansion</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;dense&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;expanded&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">px</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">expanded</span>
<span class="go">DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">px</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;test&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">test</span>
<span class="go">DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">polyExpansionPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/poly-expansion&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">px</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">polyExpansionPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedPx</span> <span class="o">=</span> <span class="n">PolynomialExpansion</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">polyExpansionPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedPx</span><span class="o">.</span><span class="n">getDegree</span><span class="p">()</span> <span class="o">==</span> <span class="n">px</span><span class="o">.</span><span class="n">getDegree</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PolynomialExpansion.degree">
<code class="descname">degree</code><em class="property"> = Param(parent='undefined', name='degree', doc='the polynomial degree to expand (&gt;= 1)')</em><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.degree" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.getDegree">
<code class="descname">getDegree</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PolynomialExpansion.getDegree"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.getDegree" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of degree or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PolynomialExpansion.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PolynomialExpansion.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.PolynomialExpansion.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.setDegree">
<code class="descname">setDegree</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PolynomialExpansion.setDegree"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.setDegree" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.PolynomialExpansion.degree" title="pyspark.ml.feature.PolynomialExpansion.degree"><code class="xref py py-attr docutils literal"><span class="pre">degree</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.PolynomialExpansion.inputCol" title="pyspark.ml.feature.PolynomialExpansion.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.PolynomialExpansion.outputCol" title="pyspark.ml.feature.PolynomialExpansion.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>degree=2</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#PolynomialExpansion.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this PolynomialExpansion.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.PolynomialExpansion.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.PolynomialExpansion.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.QuantileDiscretizer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">QuantileDiscretizer</code><span class="sig-paren">(</span><em>numBuckets=2</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>relativeError=0.001</em>, <em>handleInvalid='error'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p><cite>QuantileDiscretizer</cite> takes a column with continuous features and outputs a column with binned
categorical features. The number of bins can be set using the <a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.numBuckets" title="pyspark.ml.feature.QuantileDiscretizer.numBuckets"><code class="xref py py-attr docutils literal"><span class="pre">numBuckets</span></code></a> parameter.
It is possible that the number of buckets used will be less than this value, for example, if
there are too few distinct values of the input to create enough distinct quantiles.</p>
<p>NaN handling: by default,
QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user
can also choose to either keep or remove NaN values within the dataset by setting the
<a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.handleInvalid" title="pyspark.ml.feature.QuantileDiscretizer.handleInvalid"><code class="xref py py-attr docutils literal"><span class="pre">handleInvalid</span></code></a> parameter. If the user chooses to keep NaN values, they will be
handled specially and placed into their own bucket, for example, if 4 buckets are used, then
non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4].</p>
<p>Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for
<a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrameStatFunctions.approxQuantile" title="pyspark.sql.DataFrameStatFunctions.approxQuantile"><code class="xref py py-meth docutils literal"><span class="pre">approxQuantile()</span></code></a> for a detailed description).
The precision of the approximation can be controlled with the
<a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.relativeError" title="pyspark.ml.feature.QuantileDiscretizer.relativeError"><code class="xref py py-attr docutils literal"><span class="pre">relativeError</span></code></a> parameter.
The lower and upper bin bounds will be <cite>-Infinity</cite> and <cite>+Infinity</cite>, covering all real values.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">values</span> <span class="o">=</span> <span class="p">[(</span><span class="mf">0.1</span><span class="p">,),</span> <span class="p">(</span><span class="mf">0.4</span><span class="p">,),</span> <span class="p">(</span><span class="mf">1.2</span><span class="p">,),</span> <span class="p">(</span><span class="mf">1.5</span><span class="p">,),</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),),</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;values&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">qds</span> <span class="o">=</span> <span class="n">QuantileDiscretizer</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;values&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;buckets&quot;</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.01</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">qds</span><span class="o">.</span><span class="n">getRelativeError</span><span class="p">()</span>
<span class="go">0.01</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketizer</span> <span class="o">=</span> <span class="n">qds</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">qds</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="s2">&quot;keep&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">6</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">qds</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="s2">&quot;skip&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="go">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">splits</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">getSplits</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">splits</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">-inf</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%2.1f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="nb">round</span><span class="p">(</span><span class="n">splits</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="mi">1</span><span class="p">))</span>
<span class="go">0.4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bucketed</span><span class="o">.</span><span class="n">buckets</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">quantileDiscretizerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/quantile-discretizer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">qds</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">quantileDiscretizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedQds</span> <span class="o">=</span> <span class="n">QuantileDiscretizer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">quantileDiscretizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedQds</span><span class="o">.</span><span class="n">getNumBuckets</span><span class="p">()</span> <span class="o">==</span> <span class="n">qds</span><span class="o">.</span><span class="n">getNumBuckets</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.getHandleInvalid">
<code class="descname">getHandleInvalid</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer.getHandleInvalid"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.getHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.handleInvalid" title="pyspark.ml.feature.QuantileDiscretizer.handleInvalid"><code class="xref py py-attr docutils literal"><span class="pre">handleInvalid</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.getNumBuckets">
<code class="descname">getNumBuckets</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer.getNumBuckets"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.getNumBuckets" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numBuckets or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.getRelativeError">
<code class="descname">getRelativeError</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer.getRelativeError"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.getRelativeError" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of relativeError or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.QuantileDiscretizer.handleInvalid">
<code class="descname">handleInvalid</code><em class="property"> = Param(parent='undefined', name='handleInvalid', doc='how to handle invalid entries. Options are skip (filter out rows with invalid values), error (throw an error), or keep (keep invalid values in a special additional bucket).')</em><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.handleInvalid" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.QuantileDiscretizer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.QuantileDiscretizer.numBuckets">
<code class="descname">numBuckets</code><em class="property"> = Param(parent='undefined', name='numBuckets', doc='Maximum number of buckets (quantiles, or categories) into which data points are grouped. Must be &gt;= 2.')</em><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.numBuckets" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.QuantileDiscretizer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.QuantileDiscretizer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.QuantileDiscretizer.relativeError">
<code class="descname">relativeError</code><em class="property"> = Param(parent='undefined', name='relativeError', doc='The relative target precision for the approximate quantile algorithm used to generate buckets. Must be in the range [0, 1].')</em><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.relativeError" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.setHandleInvalid">
<code class="descname">setHandleInvalid</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer.setHandleInvalid"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.setHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.handleInvalid" title="pyspark.ml.feature.QuantileDiscretizer.handleInvalid"><code class="xref py py-attr docutils literal"><span class="pre">handleInvalid</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.inputCol" title="pyspark.ml.feature.QuantileDiscretizer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.setNumBuckets">
<code class="descname">setNumBuckets</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer.setNumBuckets"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.setNumBuckets" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.numBuckets" title="pyspark.ml.feature.QuantileDiscretizer.numBuckets"><code class="xref py py-attr docutils literal"><span class="pre">numBuckets</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.outputCol" title="pyspark.ml.feature.QuantileDiscretizer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>numBuckets=2</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>relativeError=0.001</em>, <em>handleInvalid=&quot;error&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets the params for the QuantileDiscretizer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.setRelativeError">
<code class="descname">setRelativeError</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#QuantileDiscretizer.setRelativeError"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.setRelativeError" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.QuantileDiscretizer.relativeError" title="pyspark.ml.feature.QuantileDiscretizer.relativeError"><code class="xref py py-attr docutils literal"><span class="pre">relativeError</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.QuantileDiscretizer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.QuantileDiscretizer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.RegexTokenizer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">RegexTokenizer</code><span class="sig-paren">(</span><em>minTokenLength=1</em>, <em>gaps=True</em>, <em>pattern='\s+'</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>toLowercase=True</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer" title="Permalink to this definition"></a></dt>
<dd><p>A regex-based tokenizer that extracts tokens either by using the
provided regex pattern (in Java dialect) to split the text
(default) or repeatedly matching the regex (if gaps is false).
Optional parameters also allow filtering tokens using a minimal
length.
It returns an array of strings that can be empty.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="s2">&quot;A B c&quot;</span><span class="p">,)],</span> <span class="p">[</span><span class="s2">&quot;text&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reTokenizer</span> <span class="o">=</span> <span class="n">RegexTokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;text&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;words&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reTokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;A B c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Change a parameter.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reTokenizer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;tokens&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;A B c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Temporarily modify a parameter.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reTokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="p">{</span><span class="n">reTokenizer</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;words&quot;</span><span class="p">})</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;A B c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reTokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;A B c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Must use keyword arguments to specify params.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reTokenizer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="s2">&quot;text&quot;</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">TypeError</span>: <span class="n">Method setParams forces keyword arguments.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">regexTokenizerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/regex-tokenizer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reTokenizer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">regexTokenizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedReTokenizer</span> <span class="o">=</span> <span class="n">RegexTokenizer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">regexTokenizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedReTokenizer</span><span class="o">.</span><span class="n">getMinTokenLength</span><span class="p">()</span> <span class="o">==</span> <span class="n">reTokenizer</span><span class="o">.</span><span class="n">getMinTokenLength</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedReTokenizer</span><span class="o">.</span><span class="n">getGaps</span><span class="p">()</span> <span class="o">==</span> <span class="n">reTokenizer</span><span class="o">.</span><span class="n">getGaps</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RegexTokenizer.gaps">
<code class="descname">gaps</code><em class="property"> = Param(parent='undefined', name='gaps', doc='whether regex splits on gaps (True) or matches tokens (False)')</em><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.gaps" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getGaps">
<code class="descname">getGaps</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.getGaps"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getGaps" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of gaps or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getMinTokenLength">
<code class="descname">getMinTokenLength</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.getMinTokenLength"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getMinTokenLength" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minTokenLength or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getPattern">
<code class="descname">getPattern</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.getPattern"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getPattern" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of pattern or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.getToLowercase">
<code class="descname">getToLowercase</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.getToLowercase"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.getToLowercase" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of toLowercase or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RegexTokenizer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RegexTokenizer.minTokenLength">
<code class="descname">minTokenLength</code><em class="property"> = Param(parent='undefined', name='minTokenLength', doc='minimum token length (&gt;= 0)')</em><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.minTokenLength" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RegexTokenizer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RegexTokenizer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RegexTokenizer.pattern">
<code class="descname">pattern</code><em class="property"> = Param(parent='undefined', name='pattern', doc='regex pattern (Java dialect) used for tokenizing')</em><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.pattern" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.setGaps">
<code class="descname">setGaps</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.setGaps"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.setGaps" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RegexTokenizer.gaps" title="pyspark.ml.feature.RegexTokenizer.gaps"><code class="xref py py-attr docutils literal"><span class="pre">gaps</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RegexTokenizer.inputCol" title="pyspark.ml.feature.RegexTokenizer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.setMinTokenLength">
<code class="descname">setMinTokenLength</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.setMinTokenLength"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.setMinTokenLength" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RegexTokenizer.minTokenLength" title="pyspark.ml.feature.RegexTokenizer.minTokenLength"><code class="xref py py-attr docutils literal"><span class="pre">minTokenLength</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RegexTokenizer.outputCol" title="pyspark.ml.feature.RegexTokenizer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>minTokenLength=1</em>, <em>gaps=True</em>, <em>pattern=&quot;\\s+&quot;</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>toLowercase=True</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this RegexTokenizer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.setPattern">
<code class="descname">setPattern</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.setPattern"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.setPattern" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RegexTokenizer.pattern" title="pyspark.ml.feature.RegexTokenizer.pattern"><code class="xref py py-attr docutils literal"><span class="pre">pattern</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.setToLowercase">
<code class="descname">setToLowercase</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RegexTokenizer.setToLowercase"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.setToLowercase" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RegexTokenizer.toLowercase" title="pyspark.ml.feature.RegexTokenizer.toLowercase"><code class="xref py py-attr docutils literal"><span class="pre">toLowercase</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RegexTokenizer.toLowercase">
<code class="descname">toLowercase</code><em class="property"> = Param(parent='undefined', name='toLowercase', doc='whether to convert all characters to lowercase before tokenizing')</em><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.toLowercase" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RegexTokenizer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RegexTokenizer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.RFormula">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">RFormula</code><span class="sig-paren">(</span><em>formula=None</em>, <em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>forceIndexLabel=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RFormula"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RFormula" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Implements the transforms required for fitting a dataset against an
R model formula. Currently we support a limited subset of the R
operators, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-’. Also see the <a class="reference external" href="http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html">R formula docs</a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="s2">&quot;a&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="s2">&quot;a&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;y&quot;</span><span class="p">,</span> <span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="s2">&quot;s&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf</span> <span class="o">=</span> <span class="n">RFormula</span><span class="p">(</span><span class="n">formula</span><span class="o">=</span><span class="s2">&quot;y ~ x + s&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">rf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+---+---------+-----+</span>
<span class="go">| y| x| s| features|label|</span>
<span class="go">+---+---+---+---------+-----+</span>
<span class="go">|1.0|1.0| a|[1.0,1.0]| 1.0|</span>
<span class="go">|0.0|2.0| b|[2.0,0.0]| 0.0|</span>
<span class="go">|0.0|0.0| a|[0.0,1.0]| 0.0|</span>
<span class="go">+---+---+---+---------+-----+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="p">{</span><span class="n">rf</span><span class="o">.</span><span class="n">formula</span><span class="p">:</span> <span class="s2">&quot;y ~ . - s&quot;</span><span class="p">})</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+---+--------+-----+</span>
<span class="go">| y| x| s|features|label|</span>
<span class="go">+---+---+---+--------+-----+</span>
<span class="go">|1.0|1.0| a| [1.0]| 1.0|</span>
<span class="go">|0.0|2.0| b| [2.0]| 0.0|</span>
<span class="go">|0.0|0.0| a| [0.0]| 0.0|</span>
<span class="go">+---+---+---+--------+-----+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rFormulaPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/rFormula&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">rFormulaPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedRF</span> <span class="o">=</span> <span class="n">RFormula</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">rFormulaPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedRF</span><span class="o">.</span><span class="n">getFormula</span><span class="p">()</span> <span class="o">==</span> <span class="n">rf</span><span class="o">.</span><span class="n">getFormula</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedRF</span><span class="o">.</span><span class="n">getFeaturesCol</span><span class="p">()</span> <span class="o">==</span> <span class="n">rf</span><span class="o">.</span><span class="n">getFeaturesCol</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedRF</span><span class="o">.</span><span class="n">getLabelCol</span><span class="p">()</span> <span class="o">==</span> <span class="n">rf</span><span class="o">.</span><span class="n">getLabelCol</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">str</span><span class="p">(</span><span class="n">loadedRF</span><span class="p">)</span>
<span class="go">&#39;RFormula(y ~ x + s) (uid=...)&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/rFormulaModel&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">RFormulaModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">uid</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">uid</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+---+---+---------+-----+</span>
<span class="go">| y| x| s| features|label|</span>
<span class="go">+---+---+---+---------+-----+</span>
<span class="go">|1.0|1.0| a|[1.0,1.0]| 1.0|</span>
<span class="go">|0.0|2.0| b|[2.0,0.0]| 0.0|</span>
<span class="go">|0.0|0.0| a|[0.0,1.0]| 0.0|</span>
<span class="go">+---+---+---+---------+-----+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">str</span><span class="p">(</span><span class="n">loadedModel</span><span class="p">)</span>
<span class="go">&#39;RFormulaModel(ResolvedRFormula(label=y, terms=[x,s], hasIntercept=true)) (uid=...)&#39;</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RFormula.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.feature.RFormula.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RFormula.forceIndexLabel">
<code class="descname">forceIndexLabel</code><em class="property"> = Param(parent='undefined', name='forceIndexLabel', doc='Force to index label whether it is numeric or string')</em><a class="headerlink" href="#pyspark.ml.feature.RFormula.forceIndexLabel" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RFormula.formula">
<code class="descname">formula</code><em class="property"> = Param(parent='undefined', name='formula', doc='R model formula')</em><a class="headerlink" href="#pyspark.ml.feature.RFormula.formula" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.getForceIndexLabel">
<code class="descname">getForceIndexLabel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RFormula.getForceIndexLabel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RFormula.getForceIndexLabel" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.RFormula.forceIndexLabel" title="pyspark.ml.feature.RFormula.forceIndexLabel"><code class="xref py py-attr docutils literal"><span class="pre">forceIndexLabel</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.getFormula">
<code class="descname">getFormula</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RFormula.getFormula"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RFormula.getFormula" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.RFormula.formula" title="pyspark.ml.feature.RFormula.formula"><code class="xref py py-attr docutils literal"><span class="pre">formula</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RFormula.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.feature.RFormula.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RFormula.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.RFormula.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RFormula.featuresCol" title="pyspark.ml.feature.RFormula.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.setForceIndexLabel">
<code class="descname">setForceIndexLabel</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RFormula.setForceIndexLabel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RFormula.setForceIndexLabel" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RFormula.forceIndexLabel" title="pyspark.ml.feature.RFormula.forceIndexLabel"><code class="xref py py-attr docutils literal"><span class="pre">forceIndexLabel</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.setFormula">
<code class="descname">setFormula</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RFormula.setFormula"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RFormula.setFormula" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RFormula.formula" title="pyspark.ml.feature.RFormula.formula"><code class="xref py py-attr docutils literal"><span class="pre">formula</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.RFormula.labelCol" title="pyspark.ml.feature.RFormula.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>formula=None</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>forceIndexLabel=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RFormula.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RFormula.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for RFormula.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormula.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormula.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.RFormulaModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">RFormulaModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#RFormulaModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.RFormula" title="pyspark.ml.feature.RFormula"><code class="xref py py-class docutils literal"><span class="pre">RFormula</span></code></a>. Fitting is required to determine the
factor levels of formula terms.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.RFormulaModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.RFormulaModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.RFormulaModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.SQLTransformer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">SQLTransformer</code><span class="sig-paren">(</span><em>statement=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#SQLTransformer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer" title="Permalink to this definition"></a></dt>
<dd><p>Implements the transforms which are defined by a SQL statement.
Currently we only support SQL syntax like ‘SELECT … FROM __THIS__’
where ‘__THIS__’ represents the underlying table of the input dataset.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="mi">0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">)],</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;v1&quot;</span><span class="p">,</span> <span class="s2">&quot;v2&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sqlTrans</span> <span class="o">=</span> <span class="n">SQLTransformer</span><span class="p">(</span>
<span class="gp">... </span> <span class="n">statement</span><span class="o">=</span><span class="s2">&quot;SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sqlTrans</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sqlTransformerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/sql-transformer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sqlTrans</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">sqlTransformerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedSqlTrans</span> <span class="o">=</span> <span class="n">SQLTransformer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">sqlTransformerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedSqlTrans</span><span class="o">.</span><span class="n">getStatement</span><span class="p">()</span> <span class="o">==</span> <span class="n">sqlTrans</span><span class="o">.</span><span class="n">getStatement</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.getStatement">
<code class="descname">getStatement</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#SQLTransformer.getStatement"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.getStatement" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of statement or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.SQLTransformer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>statement=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#SQLTransformer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this SQLTransformer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.setStatement">
<code class="descname">setStatement</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#SQLTransformer.setStatement"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.setStatement" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.SQLTransformer.statement" title="pyspark.ml.feature.SQLTransformer.statement"><code class="xref py py-attr docutils literal"><span class="pre">statement</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.SQLTransformer.statement">
<code class="descname">statement</code><em class="property"> = Param(parent='undefined', name='statement', doc='SQL statement')</em><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.statement" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.SQLTransformer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.SQLTransformer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.StandardScaler">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">StandardScaler</code><span class="sig-paren">(</span><em>withMean=False</em>, <em>withStd=True</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StandardScaler"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StandardScaler" title="Permalink to this definition"></a></dt>
<dd><p>Standardizes features by removing the mean and scaling to unit variance using column summary
statistics on the samples in the training set.</p>
<p>The “unit std” is computed using the <a class="reference external" href="https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation">corrected sample standard deviation</a>,
which is computed as the square root of the unbiased sample variance.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">standardScaler</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;scaled&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">standardScaler</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">mean</span>
<span class="go">DenseVector([1.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">std</span>
<span class="go">DenseVector([1.4142])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">scaled</span>
<span class="go">DenseVector([1.4142])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">standardScalerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/standard-scaler&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">standardScaler</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">standardScalerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedStandardScaler</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">standardScalerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedStandardScaler</span><span class="o">.</span><span class="n">getWithMean</span><span class="p">()</span> <span class="o">==</span> <span class="n">standardScaler</span><span class="o">.</span><span class="n">getWithMean</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedStandardScaler</span><span class="o">.</span><span class="n">getWithStd</span><span class="p">()</span> <span class="o">==</span> <span class="n">standardScaler</span><span class="o">.</span><span class="n">getWithStd</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/standard-scaler-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">StandardScalerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">std</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">std</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">mean</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">mean</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.getWithMean">
<code class="descname">getWithMean</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StandardScaler.getWithMean"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.getWithMean" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of withMean or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.getWithStd">
<code class="descname">getWithStd</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StandardScaler.getWithStd"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.getWithStd" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of withStd or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScaler.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScaler.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScaler.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StandardScaler.inputCol" title="pyspark.ml.feature.StandardScaler.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StandardScaler.outputCol" title="pyspark.ml.feature.StandardScaler.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>withMean=False</em>, <em>withStd=True</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StandardScaler.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this StandardScaler.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.setWithMean">
<code class="descname">setWithMean</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StandardScaler.setWithMean"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.setWithMean" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StandardScaler.withMean" title="pyspark.ml.feature.StandardScaler.withMean"><code class="xref py py-attr docutils literal"><span class="pre">withMean</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.setWithStd">
<code class="descname">setWithStd</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StandardScaler.setWithStd"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.setWithStd" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StandardScaler.withStd" title="pyspark.ml.feature.StandardScaler.withStd"><code class="xref py py-attr docutils literal"><span class="pre">withStd</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScaler.withMean">
<code class="descname">withMean</code><em class="property"> = Param(parent='undefined', name='withMean', doc='Center data with mean')</em><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.withMean" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScaler.withStd">
<code class="descname">withStd</code><em class="property"> = Param(parent='undefined', name='withStd', doc='Scale to unit standard deviation')</em><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.withStd" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScaler.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScaler.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.StandardScalerModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">StandardScalerModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StandardScalerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.StandardScaler" title="pyspark.ml.feature.StandardScaler"><code class="xref py py-class docutils literal"><span class="pre">StandardScaler</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScalerModel.mean">
<code class="descname">mean</code><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.mean" title="Permalink to this definition"></a></dt>
<dd><p>Mean of the StandardScalerModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScalerModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StandardScalerModel.std">
<code class="descname">std</code><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.std" title="Permalink to this definition"></a></dt>
<dd><p>Standard deviation of the StandardScalerModel.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StandardScalerModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StandardScalerModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.StopWordsRemover">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">StopWordsRemover</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em>, <em>stopWords=None</em>, <em>caseSensitive=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StopWordsRemover"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover" title="Permalink to this definition"></a></dt>
<dd><p>A feature transformer that filters out stop words from input.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Null values in the input array are preserved unless null is explicitly added to stopWords.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([([</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">],)],</span> <span class="p">[</span><span class="s2">&quot;text&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">remover</span> <span class="o">=</span> <span class="n">StopWordsRemover</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;text&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;words&quot;</span><span class="p">,</span> <span class="n">stopWords</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;b&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">remover</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">words</span> <span class="o">==</span> <span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="s1">&#39;c&#39;</span><span class="p">]</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stopWordsRemoverPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/stopwords-remover&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">remover</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">stopWordsRemoverPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedRemover</span> <span class="o">=</span> <span class="n">StopWordsRemover</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">stopWordsRemoverPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedRemover</span><span class="o">.</span><span class="n">getStopWords</span><span class="p">()</span> <span class="o">==</span> <span class="n">remover</span><span class="o">.</span><span class="n">getStopWords</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedRemover</span><span class="o">.</span><span class="n">getCaseSensitive</span><span class="p">()</span> <span class="o">==</span> <span class="n">remover</span><span class="o">.</span><span class="n">getCaseSensitive</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.feature.StopWordsRemover.caseSensitive">
<code class="descname">caseSensitive</code><em class="property"> = Param(parent='undefined', name='caseSensitive', doc='whether to do a case sensitive comparison over the stop words')</em><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.caseSensitive" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.getCaseSensitive">
<code class="descname">getCaseSensitive</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StopWordsRemover.getCaseSensitive"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.getCaseSensitive" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.StopWordsRemover.caseSensitive" title="pyspark.ml.feature.StopWordsRemover.caseSensitive"><code class="xref py py-attr docutils literal"><span class="pre">caseSensitive</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.getStopWords">
<code class="descname">getStopWords</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StopWordsRemover.getStopWords"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.getStopWords" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.StopWordsRemover.stopWords" title="pyspark.ml.feature.StopWordsRemover.stopWords"><code class="xref py py-attr docutils literal"><span class="pre">stopWords</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StopWordsRemover.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.feature.StopWordsRemover.loadDefaultStopWords">
<em class="property">static </em><code class="descname">loadDefaultStopWords</code><span class="sig-paren">(</span><em>language</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StopWordsRemover.loadDefaultStopWords"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.loadDefaultStopWords" title="Permalink to this definition"></a></dt>
<dd><p>Loads the default stop words for the given language.
Supported languages: danish, dutch, english, finnish, french, german, hungarian,
italian, norwegian, portuguese, russian, spanish, swedish, turkish.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StopWordsRemover.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StopWordsRemover.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.setCaseSensitive">
<code class="descname">setCaseSensitive</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StopWordsRemover.setCaseSensitive"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.setCaseSensitive" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StopWordsRemover.caseSensitive" title="pyspark.ml.feature.StopWordsRemover.caseSensitive"><code class="xref py py-attr docutils literal"><span class="pre">caseSensitive</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StopWordsRemover.inputCol" title="pyspark.ml.feature.StopWordsRemover.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StopWordsRemover.outputCol" title="pyspark.ml.feature.StopWordsRemover.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>stopWords=None</em>, <em>caseSensitive=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StopWordsRemover.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this StopWordsRemover.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.setStopWords">
<code class="descname">setStopWords</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StopWordsRemover.setStopWords"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.setStopWords" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StopWordsRemover.stopWords" title="pyspark.ml.feature.StopWordsRemover.stopWords"><code class="xref py py-attr docutils literal"><span class="pre">stopWords</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StopWordsRemover.stopWords">
<code class="descname">stopWords</code><em class="property"> = Param(parent='undefined', name='stopWords', doc='The words to be filtered out')</em><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.stopWords" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StopWordsRemover.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StopWordsRemover.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.StringIndexer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">StringIndexer</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em>, <em>handleInvalid='error'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StringIndexer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer" title="Permalink to this definition"></a></dt>
<dd><p>A label indexer that maps a string column of labels to an ML column of label indices.
If the input column is numeric, we cast it to string and index the string values.
The indices are in [0, numLabels), ordered by label frequencies.
So the most frequent label gets index 0.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s1">&#39;error&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">td</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">td</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">td</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">td</span><span class="o">.</span><span class="n">indexed</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span>
<span class="gp">... </span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">inverter</span> <span class="o">=</span> <span class="n">IndexToString</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;label2&quot;</span><span class="p">,</span> <span class="n">labels</span><span class="o">=</span><span class="n">model</span><span class="o">.</span><span class="n">labels</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">itd</span> <span class="o">=</span> <span class="n">inverter</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">td</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">itd</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">itd</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">itd</span><span class="o">.</span><span class="n">label2</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span>
<span class="gp">... </span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, &#39;a&#39;), (1, &#39;b&#39;), (2, &#39;c&#39;), (3, &#39;a&#39;), (4, &#39;a&#39;), (5, &#39;c&#39;)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/string-indexer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">stringIndexerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">stringIndexerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIndexer</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">()</span> <span class="o">==</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/string-indexer-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">labels</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">labels</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">indexToStringPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/index-to-string&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">inverter</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">indexToStringPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedInverter</span> <span class="o">=</span> <span class="n">IndexToString</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">indexToStringPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedInverter</span><span class="o">.</span><span class="n">getLabels</span><span class="p">()</span> <span class="o">==</span> <span class="n">inverter</span><span class="o">.</span><span class="n">getLabels</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.getHandleInvalid">
<code class="descname">getHandleInvalid</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of handleInvalid or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StringIndexer.handleInvalid">
<code class="descname">handleInvalid</code><em class="property"> = Param(parent='undefined', name='handleInvalid', doc='how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an error). More options may be added later.')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.handleInvalid" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StringIndexer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StringIndexer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StringIndexer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.setHandleInvalid">
<code class="descname">setHandleInvalid</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.handleInvalid" title="pyspark.ml.feature.StringIndexer.handleInvalid"><code class="xref py py-attr docutils literal"><span class="pre">handleInvalid</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.inputCol" title="pyspark.ml.feature.StringIndexer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.outputCol" title="pyspark.ml.feature.StringIndexer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>handleInvalid=&quot;error&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StringIndexer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this StringIndexer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.StringIndexerModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">StringIndexerModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#StringIndexerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.StringIndexer" title="pyspark.ml.feature.StringIndexer"><code class="xref py py-class docutils literal"><span class="pre">StringIndexer</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StringIndexerModel.labels">
<code class="descname">labels</code><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.labels" title="Permalink to this definition"></a></dt>
<dd><p>Ordered list of labels, corresponding to indices to be assigned.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.StringIndexerModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.StringIndexerModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexerModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.Tokenizer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">Tokenizer</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Tokenizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Tokenizer" title="Permalink to this definition"></a></dt>
<dd><p>A tokenizer that converts the input string to lowercase and then
splits it by white spaces.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="s2">&quot;a b c&quot;</span><span class="p">,)],</span> <span class="p">[</span><span class="s2">&quot;text&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;text&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;words&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;a b c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Change a parameter.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;tokens&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;a b c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Temporarily modify a parameter.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="p">{</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;words&quot;</span><span class="p">})</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;a b c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(text=&#39;a b c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Must use keyword arguments to specify params.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="s2">&quot;text&quot;</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">TypeError</span>: <span class="n">Method setParams forces keyword arguments.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/tokenizer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tokenizer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">tokenizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedTokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">tokenizerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedTokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">tokens</span> <span class="o">==</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">tokens</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Tokenizer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Tokenizer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Tokenizer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Tokenizer.inputCol" title="pyspark.ml.feature.Tokenizer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Tokenizer.outputCol" title="pyspark.ml.feature.Tokenizer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Tokenizer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this Tokenizer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Tokenizer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Tokenizer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.VectorAssembler">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">VectorAssembler</code><span class="sig-paren">(</span><em>inputCols=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorAssembler"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler" title="Permalink to this definition"></a></dt>
<dd><p>A feature transformer that merges multiple columns into a vector column.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">)],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vecAssembler</span> <span class="o">=</span> <span class="n">VectorAssembler</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">],</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vecAssembler</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">features</span>
<span class="go">DenseVector([1.0, 0.0, 3.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vecAssembler</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;freqs&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">freqs</span>
<span class="go">DenseVector([1.0, 0.0, 3.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="n">vecAssembler</span><span class="o">.</span><span class="n">inputCols</span><span class="p">:</span> <span class="p">[</span><span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;a&quot;</span><span class="p">],</span> <span class="n">vecAssembler</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;vector&quot;</span><span class="p">}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vecAssembler</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span>
<span class="go">DenseVector([0.0, 1.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vectorAssemblerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/vector-assembler&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vecAssembler</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">vectorAssemblerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedAssembler</span> <span class="o">=</span> <span class="n">VectorAssembler</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">vectorAssemblerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedAssembler</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">freqs</span> <span class="o">==</span> <span class="n">vecAssembler</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">freqs</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.getInputCols">
<code class="descname">getInputCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.getInputCols" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCols or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorAssembler.inputCols">
<code class="descname">inputCols</code><em class="property"> = Param(parent='undefined', name='inputCols', doc='input column names.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.inputCols" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorAssembler.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorAssembler.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.setInputCols">
<code class="descname">setInputCols</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.setInputCols" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorAssembler.inputCols" title="pyspark.ml.feature.VectorAssembler.inputCols"><code class="xref py py-attr docutils literal"><span class="pre">inputCols</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorAssembler.outputCol" title="pyspark.ml.feature.VectorAssembler.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>inputCols=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorAssembler.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this VectorAssembler.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorAssembler.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorAssembler.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.VectorIndexer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">VectorIndexer</code><span class="sig-paren">(</span><em>maxCategories=20</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorIndexer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer" title="Permalink to this definition"></a></dt>
<dd><p>Class for indexing categorical feature columns in a dataset of <cite>Vector</cite>.</p>
<dl class="docutils">
<dt>This has 2 usage modes:</dt>
<dd><blockquote class="first">
<div><ul class="simple">
<li><dl class="first docutils">
<dt>Automatically identify categorical features (default behavior)</dt>
<dd><ul class="first last">
<li>This helps process a dataset of unknown vectors into a dataset with some continuous
features and some categorical features. The choice between continuous and categorical
is based upon a maxCategories parameter.</li>
<li>Set maxCategories to the maximum number of categories any categorical feature should
have.</li>
<li>E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},
and feature 1 will be declared continuous.</li>
</ul>
</dd>
</dl>
</li>
<li><dl class="first docutils">
<dt>Index all features, if all features are categorical</dt>
<dd><ul class="first last">
<li>If maxCategories is set to be very large, then this will build an index of unique
values for all features.</li>
<li>Warning: This can cause problems if features are continuous since this will collect ALL
unique values to the driver.</li>
<li>E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
If maxCategories &gt;= 3, then both features will be declared categorical.</li>
</ul>
</dd>
</dl>
</li>
</ul>
</div></blockquote>
<p class="last">This returns a model which can transform categorical features to use 0-based indices.</p>
</dd>
<dt>Index stability:</dt>
<dd><blockquote class="first">
<div><ul class="simple">
<li>This is not guaranteed to choose the same category index across multiple runs.</li>
<li>If a categorical feature includes value 0, then this is guaranteed to map value 0 to
index 0. This maintains vector sparsity.</li>
<li>More stability may be added in the future.</li>
</ul>
</div></blockquote>
<dl class="last docutils">
<dt>TODO: Future extensions: The following functionality is planned for the future:</dt>
<dd><ul class="first last simple">
<li>Preserve metadata in transform; if a feature’s metadata is already present,
do not recompute.</li>
<li>Specify certain features to not index, either via a parameter or via existing metadata.</li>
<li>Add warning if a categorical feature has only 1 category.</li>
<li>Add option for allowing unknown categories.</li>
</ul>
</dd>
</dl>
</dd>
</dl>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">indexer</span> <span class="o">=</span> <span class="n">VectorIndexer</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">indexed</span>
<span class="go">DenseVector([1.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">categoryMaps</span>
<span class="go">{0: {0.0: 0, -1.0: 1}}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">indexer</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;test&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">test</span>
<span class="go">DenseVector([0.0, 1.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="n">indexer</span><span class="o">.</span><span class="n">maxCategories</span><span class="p">:</span> <span class="mi">3</span><span class="p">,</span> <span class="n">indexer</span><span class="o">.</span><span class="n">outputCol</span><span class="p">:</span> <span class="s2">&quot;vector&quot;</span><span class="p">}</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span>
<span class="go">DenseVector([1.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vectorIndexerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/vector-indexer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">indexer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">vectorIndexerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIndexer</span> <span class="o">=</span> <span class="n">VectorIndexer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">vectorIndexerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIndexer</span><span class="o">.</span><span class="n">getMaxCategories</span><span class="p">()</span> <span class="o">==</span> <span class="n">indexer</span><span class="o">.</span><span class="n">getMaxCategories</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/vector-indexer-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">VectorIndexerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">numFeatures</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">categoryMaps</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">categoryMaps</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.getMaxCategories">
<code class="descname">getMaxCategories</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorIndexer.getMaxCategories"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.getMaxCategories" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxCategories or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorIndexer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorIndexer.maxCategories">
<code class="descname">maxCategories</code><em class="property"> = Param(parent='undefined', name='maxCategories', doc='Threshold for the number of values a categorical feature can take (&gt;= 2). If a feature is found to have &gt; maxCategories values, then it is declared continuous.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.maxCategories" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorIndexer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorIndexer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorIndexer.inputCol" title="pyspark.ml.feature.VectorIndexer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.setMaxCategories">
<code class="descname">setMaxCategories</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorIndexer.setMaxCategories"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.setMaxCategories" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorIndexer.maxCategories" title="pyspark.ml.feature.VectorIndexer.maxCategories"><code class="xref py py-attr docutils literal"><span class="pre">maxCategories</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorIndexer.outputCol" title="pyspark.ml.feature.VectorIndexer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>maxCategories=20</em>, <em>inputCol=None</em>, <em>outputCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorIndexer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this VectorIndexer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.VectorIndexerModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">VectorIndexerModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorIndexerModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.VectorIndexer" title="pyspark.ml.feature.VectorIndexer"><code class="xref py py-class docutils literal"><span class="pre">VectorIndexer</span></code></a>.</p>
<dl class="docutils">
<dt>Transform categorical features to use 0-based indices instead of their original values.</dt>
<dd><ul class="first last simple">
<li>Categorical features are mapped to indices.</li>
<li>Continuous features (columns) are left unchanged.</li>
</ul>
</dd>
</dl>
<p>This also appends metadata to the output column, marking features as Numeric (continuous),
Nominal (categorical), or Binary (either continuous or categorical).
Non-ML metadata is not carried over from the input to the output column.</p>
<p>This maintains vector sparsity.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorIndexerModel.categoryMaps">
<code class="descname">categoryMaps</code><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.categoryMaps" title="Permalink to this definition"></a></dt>
<dd><p>Feature value index. Keys are categorical feature indices (column indices).
Values are maps from original feature values to 0-based category indices.
If a feature is not in this map, it is treated as continuous.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorIndexerModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Number of features, i.e., length of Vectors which this transforms.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorIndexerModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorIndexerModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorIndexerModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.VectorSlicer">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">VectorSlicer</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em>, <em>indices=None</em>, <em>names=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorSlicer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer" title="Permalink to this definition"></a></dt>
<dd><p>This class takes a feature vector and outputs a new feature vector with a subarray
of the original features.</p>
<p>The subset of features can be specified with either indices (<cite>setIndices()</cite>)
or names (<cite>setNames()</cite>). At least one feature must be selected. Duplicate features
are not allowed, so there can be no overlap between selected indices and names.</p>
<p>The output vector will order features with the selected indices first (in the order given),
followed by the selected names (in the order given).</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.3</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.6</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.1</span><span class="p">,</span> <span class="o">-</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.5</span><span class="p">,</span> <span class="mf">3.3</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vs</span> <span class="o">=</span> <span class="n">VectorSlicer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;sliced&quot;</span><span class="p">,</span> <span class="n">indices</span><span class="o">=</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">4</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vs</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">sliced</span>
<span class="go">DenseVector([2.3, 1.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vectorSlicerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/vector-slicer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vs</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">vectorSlicerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedVs</span> <span class="o">=</span> <span class="n">VectorSlicer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">vectorSlicerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedVs</span><span class="o">.</span><span class="n">getIndices</span><span class="p">()</span> <span class="o">==</span> <span class="n">vs</span><span class="o">.</span><span class="n">getIndices</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedVs</span><span class="o">.</span><span class="n">getNames</span><span class="p">()</span> <span class="o">==</span> <span class="n">vs</span><span class="o">.</span><span class="n">getNames</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.getIndices">
<code class="descname">getIndices</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorSlicer.getIndices"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.getIndices" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of indices or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.getNames">
<code class="descname">getNames</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorSlicer.getNames"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.getNames" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of names or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorSlicer.indices">
<code class="descname">indices</code><em class="property"> = Param(parent='undefined', name='indices', doc='An array of indices to select features from a vector column. There can be no overlap with names.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.indices" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorSlicer.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorSlicer.names">
<code class="descname">names</code><em class="property"> = Param(parent='undefined', name='names', doc='An array of feature names to select features from a vector column. These names must be specified by ML org.apache.spark.ml.attribute.Attribute. There can be no overlap with indices.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.names" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorSlicer.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.VectorSlicer.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.setIndices">
<code class="descname">setIndices</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorSlicer.setIndices"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.setIndices" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorSlicer.indices" title="pyspark.ml.feature.VectorSlicer.indices"><code class="xref py py-attr docutils literal"><span class="pre">indices</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorSlicer.inputCol" title="pyspark.ml.feature.VectorSlicer.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.setNames">
<code class="descname">setNames</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorSlicer.setNames"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.setNames" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorSlicer.names" title="pyspark.ml.feature.VectorSlicer.names"><code class="xref py py-attr docutils literal"><span class="pre">names</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.VectorSlicer.outputCol" title="pyspark.ml.feature.VectorSlicer.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>inputCol=None</em>, <em>outputCol=None</em>, <em>indices=None</em>, <em>names=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#VectorSlicer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
Sets params for this VectorSlicer.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.VectorSlicer.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.VectorSlicer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.Word2Vec">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">Word2Vec</code><span class="sig-paren">(</span><em>vectorSize=100</em>, <em>minCount=5</em>, <em>numPartitions=1</em>, <em>stepSize=0.025</em>, <em>maxIter=1</em>, <em>seed=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>windowSize=5</em>, <em>maxSentenceLength=1000</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec" title="Permalink to this definition"></a></dt>
<dd><p>Word2Vec trains a model of <cite>Map(String, Vector)</cite>, i.e. transforms a word into a code for further
natural language processing or machine learning processes.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">sent</span> <span class="o">=</span> <span class="p">(</span><span class="s2">&quot;a b &quot;</span> <span class="o">*</span> <span class="mi">100</span> <span class="o">+</span> <span class="s2">&quot;a c &quot;</span> <span class="o">*</span> <span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot; &quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">doc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">sent</span><span class="p">,),</span> <span class="p">(</span><span class="n">sent</span><span class="p">,)],</span> <span class="p">[</span><span class="s2">&quot;sentence&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">word2Vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;sentence&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;model&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">doc</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">getVectors</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+----+--------------------+</span>
<span class="go">|word| vector|</span>
<span class="go">+----+--------------------+</span>
<span class="go">| a|[0.09461779892444...|</span>
<span class="go">| b|[1.15474212169647...|</span>
<span class="go">| c|[-0.3794820010662...|</span>
<span class="go">+----+--------------------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="k">import</span> <span class="n">format_number</span> <span class="k">as</span> <span class="n">fmt</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">findSynonyms</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;word&quot;</span><span class="p">,</span> <span class="n">fmt</span><span class="p">(</span><span class="s2">&quot;similarity&quot;</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;similarity&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+----+----------+</span>
<span class="go">|word|similarity|</span>
<span class="go">+----+----------+</span>
<span class="go">| b| 0.25053|</span>
<span class="go">| c| -0.69805|</span>
<span class="go">+----+----------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">doc</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">model</span>
<span class="go">DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">word2vecPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/word2vec&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">word2Vec</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">word2vecPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedWord2Vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">word2vecPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedWord2Vec</span><span class="o">.</span><span class="n">getVectorSize</span><span class="p">()</span> <span class="o">==</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">getVectorSize</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedWord2Vec</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">==</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedWord2Vec</span><span class="o">.</span><span class="n">getMinCount</span><span class="p">()</span> <span class="o">==</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">getMinCount</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/word2vec-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">Word2VecModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">getVectors</span><span class="p">()</span><span class="o">.</span><span class="n">first</span><span class="p">()</span><span class="o">.</span><span class="n">word</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">getVectors</span><span class="p">()</span><span class="o">.</span><span class="n">first</span><span class="p">()</span><span class="o">.</span><span class="n">word</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">getVectors</span><span class="p">()</span><span class="o">.</span><span class="n">first</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">getVectors</span><span class="p">()</span><span class="o">.</span><span class="n">first</span><span class="p">()</span><span class="o">.</span><span class="n">vector</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getInputCol">
<code class="descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getMaxSentenceLength">
<code class="descname">getMaxSentenceLength</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.getMaxSentenceLength"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getMaxSentenceLength" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxSentenceLength or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getMinCount">
<code class="descname">getMinCount</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.getMinCount"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getMinCount" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minCount or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getNumPartitions">
<code class="descname">getNumPartitions</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.getNumPartitions"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getNumPartitions" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numPartitions or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getOutputCol">
<code class="descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getStepSize">
<code class="descname">getStepSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of stepSize or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getVectorSize">
<code class="descname">getVectorSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.getVectorSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getVectorSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of vectorSize or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.getWindowSize">
<code class="descname">getWindowSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.getWindowSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.getWindowSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of windowSize or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.inputCol">
<code class="descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.maxSentenceLength">
<code class="descname">maxSentenceLength</code><em class="property"> = Param(parent='undefined', name='maxSentenceLength', doc='Maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks up to the size.')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.maxSentenceLength" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.minCount">
<code class="descname">minCount</code><em class="property"> = Param(parent='undefined', name='minCount', doc=&quot;the minimum number of times a token must appear to be included in the word2vec model's vocabulary&quot;)</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.minCount" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.numPartitions">
<code class="descname">numPartitions</code><em class="property"> = Param(parent='undefined', name='numPartitions', doc='number of partitions for sentences of words')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.numPartitions" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.outputCol">
<code class="descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setInputCol">
<code class="descname">setInputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.inputCol" title="pyspark.ml.feature.Word2Vec.inputCol"><code class="xref py py-attr docutils literal"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.maxIter" title="pyspark.ml.feature.Word2Vec.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setMaxSentenceLength">
<code class="descname">setMaxSentenceLength</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.setMaxSentenceLength"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setMaxSentenceLength" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.maxSentenceLength" title="pyspark.ml.feature.Word2Vec.maxSentenceLength"><code class="xref py py-attr docutils literal"><span class="pre">maxSentenceLength</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setMinCount">
<code class="descname">setMinCount</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.setMinCount"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setMinCount" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.minCount" title="pyspark.ml.feature.Word2Vec.minCount"><code class="xref py py-attr docutils literal"><span class="pre">minCount</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setNumPartitions">
<code class="descname">setNumPartitions</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.setNumPartitions"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setNumPartitions" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.numPartitions" title="pyspark.ml.feature.Word2Vec.numPartitions"><code class="xref py py-attr docutils literal"><span class="pre">numPartitions</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setOutputCol">
<code class="descname">setOutputCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.outputCol" title="pyspark.ml.feature.Word2Vec.outputCol"><code class="xref py py-attr docutils literal"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>minCount=5</em>, <em>numPartitions=1</em>, <em>stepSize=0.025</em>, <em>maxIter=1</em>, <em>seed=None</em>, <em>inputCol=None</em>, <em>outputCol=None</em>, <em>windowSize=5</em>, <em>maxSentenceLength=1000</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this Word2Vec.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.seed" title="pyspark.ml.feature.Word2Vec.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setStepSize">
<code class="descname">setStepSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.stepSize" title="pyspark.ml.feature.Word2Vec.stepSize"><code class="xref py py-attr docutils literal"><span class="pre">stepSize</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setVectorSize">
<code class="descname">setVectorSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.setVectorSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setVectorSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.vectorSize" title="pyspark.ml.feature.Word2Vec.vectorSize"><code class="xref py py-attr docutils literal"><span class="pre">vectorSize</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.setWindowSize">
<code class="descname">setWindowSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2Vec.setWindowSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.setWindowSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.Word2Vec.windowSize" title="pyspark.ml.feature.Word2Vec.windowSize"><code class="xref py py-attr docutils literal"><span class="pre">windowSize</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.stepSize">
<code class="descname">stepSize</code><em class="property"> = Param(parent='undefined', name='stepSize', doc='Step size to be used for each iteration of optimization (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.stepSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.vectorSize">
<code class="descname">vectorSize</code><em class="property"> = Param(parent='undefined', name='vectorSize', doc='the dimension of codes after transforming from words')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.vectorSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2Vec.windowSize">
<code class="descname">windowSize</code><em class="property"> = Param(parent='undefined', name='windowSize', doc='the window size (context words from [-window, window]). Default value is 5')</em><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.windowSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2Vec.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2Vec.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.feature.Word2VecModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.feature.</code><code class="descname">Word2VecModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2VecModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.feature.Word2Vec" title="pyspark.ml.feature.Word2Vec"><code class="xref py py-class docutils literal"><span class="pre">Word2Vec</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.findSynonyms">
<code class="descname">findSynonyms</code><span class="sig-paren">(</span><em>word</em>, <em>num</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2VecModel.findSynonyms"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.findSynonyms" title="Permalink to this definition"></a></dt>
<dd><p>Finds the “num” words closest in similarity to “word”.
“word” can be a string or a vector representation.
Returns a dataframe with two fields, word and similarity (which
gives the cosine similarity).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.getVectors">
<code class="descname">getVectors</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/feature.html#Word2VecModel.getVectors"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.getVectors" title="Permalink to this definition"></a></dt>
<dd><p>Returns the vector representation of the words as a dataframe
with two fields, word and vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.feature.Word2VecModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.feature.Word2VecModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.Word2VecModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.classification">
<span id="pyspark-ml-classification-module"></span><h2>pyspark.ml.classification module<a class="headerlink" href="#module-pyspark.ml.classification" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.classification.LinearSVC">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">LinearSVC</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxIter=100</em>, <em>regParam=0.0</em>, <em>tol=1e-06</em>, <em>rawPredictionCol='rawPrediction'</em>, <em>fitIntercept=True</em>, <em>standardization=True</em>, <em>threshold=0.0</em>, <em>weightCol=None</em>, <em>aggregationDepth=2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LinearSVC"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LinearSVC" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p><a class="reference external" href="https://en.wikipedia.org/wiki/Support_vector_machine#Linear_SVM">Linear SVM Classifier</a></p>
<p>This binary classifier optimizes the Hinge Loss using the OWLQN optimizer.
Only supports L2 regularization currently.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="k">import</span> <span class="n">Row</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">regParam</span><span class="o">=</span><span class="mf">0.01</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">coefficients</span>
<span class="go">DenseVector([0.0, -0.2792, -0.1833])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">intercept</span>
<span class="go">1.0206118982229047</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numClasses</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">rawPrediction</span>
<span class="go">DenseVector([-1.4831, 1.4831])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/svm&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">svm_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm2</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">svm_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">svm2</span><span class="o">.</span><span class="n">getMaxIter</span><span class="p">()</span>
<span class="go">5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/svm_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">LinearSVCModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">intercept</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">intercept</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.aggregationDepth">
<code class="descname">aggregationDepth</code><em class="property"> = Param(parent='undefined', name='aggregationDepth', doc='suggested depth for treeAggregate (&gt;= 2).')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.aggregationDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.fitIntercept">
<code class="descname">fitIntercept</code><em class="property"> = Param(parent='undefined', name='fitIntercept', doc='whether to fit an intercept term.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.fitIntercept" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getAggregationDepth">
<code class="descname">getAggregationDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of aggregationDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getFitIntercept">
<code class="descname">getFitIntercept</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fitIntercept or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getRawPredictionCol">
<code class="descname">getRawPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of rawPredictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getRegParam">
<code class="descname">getRegParam</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of regParam or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getStandardization">
<code class="descname">getStandardization</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getStandardization" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of standardization or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getThreshold">
<code class="descname">getThreshold</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LinearSVC.getThreshold"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of threshold or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.rawPredictionCol">
<code class="descname">rawPredictionCol</code><em class="property"> = Param(parent='undefined', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.rawPredictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.regParam">
<code class="descname">regParam</code><em class="property"> = Param(parent='undefined', name='regParam', doc='regularization parameter (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.regParam" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setAggregationDepth">
<code class="descname">setAggregationDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.aggregationDepth" title="pyspark.ml.classification.LinearSVC.aggregationDepth"><code class="xref py py-attr docutils literal"><span class="pre">aggregationDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.featuresCol" title="pyspark.ml.classification.LinearSVC.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setFitIntercept">
<code class="descname">setFitIntercept</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.fitIntercept" title="pyspark.ml.classification.LinearSVC.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">fitIntercept</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.labelCol" title="pyspark.ml.classification.LinearSVC.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.maxIter" title="pyspark.ml.classification.LinearSVC.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxIter=100</em>, <em>regParam=0.0</em>, <em>tol=1e-06</em>, <em>rawPredictionCol='rawPrediction'</em>, <em>fitIntercept=True</em>, <em>standardization=True</em>, <em>threshold=0.0</em>, <em>weightCol=None</em>, <em>aggregationDepth=2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LinearSVC.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, featuresCol=&quot;features&quot;, labelCol=&quot;label&quot;, predictionCol=&quot;prediction&quot;, maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol=&quot;rawPrediction&quot;, fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
Sets params for Linear SVM Classifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.predictionCol" title="pyspark.ml.classification.LinearSVC.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setRawPredictionCol">
<code class="descname">setRawPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.rawPredictionCol" title="pyspark.ml.classification.LinearSVC.rawPredictionCol"><code class="xref py py-attr docutils literal"><span class="pre">rawPredictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setRegParam">
<code class="descname">setRegParam</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.regParam" title="pyspark.ml.classification.LinearSVC.regParam"><code class="xref py py-attr docutils literal"><span class="pre">regParam</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setStandardization">
<code class="descname">setStandardization</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setStandardization" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.standardization" title="pyspark.ml.classification.LinearSVC.standardization"><code class="xref py py-attr docutils literal"><span class="pre">standardization</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setThreshold">
<code class="descname">setThreshold</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LinearSVC.setThreshold"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.threshold" title="pyspark.ml.classification.LinearSVC.threshold"><code class="xref py py-attr docutils literal"><span class="pre">threshold</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.tol" title="pyspark.ml.classification.LinearSVC.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LinearSVC.weightCol" title="pyspark.ml.classification.LinearSVC.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.standardization">
<code class="descname">standardization</code><em class="property"> = Param(parent='undefined', name='standardization', doc='whether to standardize the training features before fitting the model.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.standardization" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.threshold">
<code class="descname">threshold</code><em class="property"> = Param(parent='undefined', name='threshold', doc='The threshold in binary classification applied to the linear model prediction. This threshold can be any real number, where Inf will make all predictions 0.0 and -Inf will make all predictions 1.0.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.threshold" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVC.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVC.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVC.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.LinearSVCModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">LinearSVCModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LinearSVCModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by LinearSVC.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVCModel.coefficients">
<code class="descname">coefficients</code><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.coefficients" title="Permalink to this definition"></a></dt>
<dd><p>Model coefficients of Linear SVM Classifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVCModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Model intercept of Linear SVM Classifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVCModel.numClasses">
<code class="descname">numClasses</code><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.numClasses" title="Permalink to this definition"></a></dt>
<dd><p>Number of classes (values which the label can take).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVCModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LinearSVCModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LinearSVCModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LinearSVCModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.LogisticRegression">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">LogisticRegression</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxIter=100</em>, <em>regParam=0.0</em>, <em>elasticNetParam=0.0</em>, <em>tol=1e-06</em>, <em>fitIntercept=True</em>, <em>threshold=0.5</em>, <em>thresholds=None</em>, <em>probabilityCol='probability'</em>, <em>rawPredictionCol='rawPrediction'</em>, <em>standardization=True</em>, <em>weightCol=None</em>, <em>aggregationDepth=2</em>, <em>family='auto'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression" title="Permalink to this definition"></a></dt>
<dd><p>Logistic regression.
This class supports multinomial logistic (softmax) and binomial logistic regression.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="k">import</span> <span class="n">Row</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bdf</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">weight</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">weight</span><span class="o">=</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">weight</span><span class="o">=</span><span class="mf">3.0</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">weight</span><span class="o">=</span><span class="mf">4.0</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blor</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">(</span><span class="n">regParam</span><span class="o">=</span><span class="mf">0.01</span><span class="p">,</span> <span class="n">weightCol</span><span class="o">=</span><span class="s2">&quot;weight&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blorModel</span> <span class="o">=</span> <span class="n">blor</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">bdf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blorModel</span><span class="o">.</span><span class="n">coefficients</span>
<span class="go">DenseVector([-1.080..., -0.646...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blorModel</span><span class="o">.</span><span class="n">intercept</span>
<span class="go">3.112...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data_path</span> <span class="o">=</span> <span class="s2">&quot;data/mllib/sample_multiclass_classification_data.txt&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mdf</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;libsvm&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">data_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlor</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">(</span><span class="n">regParam</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span> <span class="n">elasticNetParam</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">family</span><span class="o">=</span><span class="s2">&quot;multinomial&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlorModel</span> <span class="o">=</span> <span class="n">mlor</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">mdf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlorModel</span><span class="o">.</span><span class="n">coefficientMatrix</span>
<span class="go">SparseMatrix(3, 4, [0, 1, 2, 3], [3, 2, 1], [1.87..., -2.75..., -0.50...], 1)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlorModel</span><span class="o">.</span><span class="n">interceptVector</span>
<span class="go">DenseVector([0.04..., -0.42..., 0.37...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">blorModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">probability</span>
<span class="go">DenseVector([0.02..., 0.97...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">rawPrediction</span>
<span class="go">DenseVector([-3.54..., 3.54...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blorModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blor</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="s2">&quot;vector&quot;</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">TypeError</span>: <span class="n">Method setParams forces keyword arguments.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/lr&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blor</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">lr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr2</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">lr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr2</span><span class="o">.</span><span class="n">getRegParam</span><span class="p">()</span>
<span class="go">0.01</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/lr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blorModel</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">LogisticRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blorModel</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">blorModel</span><span class="o">.</span><span class="n">intercept</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">intercept</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.aggregationDepth">
<code class="descname">aggregationDepth</code><em class="property"> = Param(parent='undefined', name='aggregationDepth', doc='suggested depth for treeAggregate (&gt;= 2).')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.aggregationDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.elasticNetParam">
<code class="descname">elasticNetParam</code><em class="property"> = Param(parent='undefined', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.elasticNetParam" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.family">
<code class="descname">family</code><em class="property"> = Param(parent='undefined', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.family" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.fitIntercept">
<code class="descname">fitIntercept</code><em class="property"> = Param(parent='undefined', name='fitIntercept', doc='whether to fit an intercept term.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.fitIntercept" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getAggregationDepth">
<code class="descname">getAggregationDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of aggregationDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getElasticNetParam">
<code class="descname">getElasticNetParam</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getElasticNetParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of elasticNetParam or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getFamily">
<code class="descname">getFamily</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression.getFamily"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getFamily" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.family" title="pyspark.ml.classification.LogisticRegression.family"><code class="xref py py-attr docutils literal"><span class="pre">family</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getFitIntercept">
<code class="descname">getFitIntercept</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fitIntercept or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getProbabilityCol">
<code class="descname">getProbabilityCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of probabilityCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getRawPredictionCol">
<code class="descname">getRawPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of rawPredictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getRegParam">
<code class="descname">getRegParam</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of regParam or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getStandardization">
<code class="descname">getStandardization</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getStandardization" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of standardization or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getThreshold">
<code class="descname">getThreshold</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression.getThreshold"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Get threshold for binary classification.</p>
<p>If <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.thresholds" title="pyspark.ml.classification.LogisticRegression.thresholds"><code class="xref py py-attr docutils literal"><span class="pre">thresholds</span></code></a> is set with length 2 (i.e., binary classification),
this returns the equivalent threshold:
<span class="math">\(\frac{1}{1 + \frac{thresholds(0)}{thresholds(1)}}\)</span>.
Otherwise, returns <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.threshold" title="pyspark.ml.classification.LogisticRegression.threshold"><code class="xref py py-attr docutils literal"><span class="pre">threshold</span></code></a> if set or its default value if unset.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getThresholds">
<code class="descname">getThresholds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression.getThresholds"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getThresholds" title="Permalink to this definition"></a></dt>
<dd><p>If <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.thresholds" title="pyspark.ml.classification.LogisticRegression.thresholds"><code class="xref py py-attr docutils literal"><span class="pre">thresholds</span></code></a> is set, return its value.
Otherwise, if <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.threshold" title="pyspark.ml.classification.LogisticRegression.threshold"><code class="xref py py-attr docutils literal"><span class="pre">threshold</span></code></a> is set, return the equivalent thresholds for binary
classification: (1-threshold, threshold).
If neither is set, throw an error.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.probabilityCol">
<code class="descname">probabilityCol</code><em class="property"> = Param(parent='undefined', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.probabilityCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.rawPredictionCol">
<code class="descname">rawPredictionCol</code><em class="property"> = Param(parent='undefined', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.rawPredictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.regParam">
<code class="descname">regParam</code><em class="property"> = Param(parent='undefined', name='regParam', doc='regularization parameter (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.regParam" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setAggregationDepth">
<code class="descname">setAggregationDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.aggregationDepth" title="pyspark.ml.classification.LogisticRegression.aggregationDepth"><code class="xref py py-attr docutils literal"><span class="pre">aggregationDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setElasticNetParam">
<code class="descname">setElasticNetParam</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setElasticNetParam" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.elasticNetParam" title="pyspark.ml.classification.LogisticRegression.elasticNetParam"><code class="xref py py-attr docutils literal"><span class="pre">elasticNetParam</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setFamily">
<code class="descname">setFamily</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression.setFamily"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setFamily" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.family" title="pyspark.ml.classification.LogisticRegression.family"><code class="xref py py-attr docutils literal"><span class="pre">family</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.featuresCol" title="pyspark.ml.classification.LogisticRegression.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setFitIntercept">
<code class="descname">setFitIntercept</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.fitIntercept" title="pyspark.ml.classification.LogisticRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">fitIntercept</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.labelCol" title="pyspark.ml.classification.LogisticRegression.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.maxIter" title="pyspark.ml.classification.LogisticRegression.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxIter=100</em>, <em>regParam=0.0</em>, <em>elasticNetParam=0.0</em>, <em>tol=1e-6</em>, <em>fitIntercept=True</em>, <em>threshold=0.5</em>, <em>thresholds=None</em>, <em>probabilityCol=&quot;probability&quot;</em>, <em>rawPredictionCol=&quot;rawPrediction&quot;</em>, <em>standardization=True</em>, <em>weightCol=None</em>, <em>aggregationDepth=2</em>, <em>family=&quot;auto&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.predictionCol" title="pyspark.ml.classification.LogisticRegression.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setProbabilityCol">
<code class="descname">setProbabilityCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.probabilityCol" title="pyspark.ml.classification.LogisticRegression.probabilityCol"><code class="xref py py-attr docutils literal"><span class="pre">probabilityCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setRawPredictionCol">
<code class="descname">setRawPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.rawPredictionCol" title="pyspark.ml.classification.LogisticRegression.rawPredictionCol"><code class="xref py py-attr docutils literal"><span class="pre">rawPredictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setRegParam">
<code class="descname">setRegParam</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.regParam" title="pyspark.ml.classification.LogisticRegression.regParam"><code class="xref py py-attr docutils literal"><span class="pre">regParam</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setStandardization">
<code class="descname">setStandardization</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setStandardization" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.standardization" title="pyspark.ml.classification.LogisticRegression.standardization"><code class="xref py py-attr docutils literal"><span class="pre">standardization</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setThreshold">
<code class="descname">setThreshold</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression.setThreshold"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.threshold" title="pyspark.ml.classification.LogisticRegression.threshold"><code class="xref py py-attr docutils literal"><span class="pre">threshold</span></code></a>.
Clears value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.thresholds" title="pyspark.ml.classification.LogisticRegression.thresholds"><code class="xref py py-attr docutils literal"><span class="pre">thresholds</span></code></a> if it has been set.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setThresholds">
<code class="descname">setThresholds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegression.setThresholds"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setThresholds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.thresholds" title="pyspark.ml.classification.LogisticRegression.thresholds"><code class="xref py py-attr docutils literal"><span class="pre">thresholds</span></code></a>.
Clears value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.threshold" title="pyspark.ml.classification.LogisticRegression.threshold"><code class="xref py py-attr docutils literal"><span class="pre">threshold</span></code></a> if it has been set.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.tol" title="pyspark.ml.classification.LogisticRegression.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.LogisticRegression.weightCol" title="pyspark.ml.classification.LogisticRegression.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.standardization">
<code class="descname">standardization</code><em class="property"> = Param(parent='undefined', name='standardization', doc='whether to standardize the training features before fitting the model.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.standardization" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.threshold">
<code class="descname">threshold</code><em class="property"> = Param(parent='undefined', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match; e.g., if threshold is p, then thresholds must be equal to [1-p, p].')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.threshold" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.thresholds">
<code class="descname">thresholds</code><em class="property"> = Param(parent='undefined', name='thresholds', doc=&quot;Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values &gt; 0, except that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.&quot;)</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.thresholds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegression.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegression.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegression.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.LogisticRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">LogisticRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by LogisticRegression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.coefficientMatrix">
<code class="descname">coefficientMatrix</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.coefficientMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Model coefficients.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.coefficients">
<code class="descname">coefficients</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.coefficients" title="Permalink to this definition"></a></dt>
<dd><p>Model coefficients of binomial logistic regression.
An exception is thrown in the case of multinomial logistic regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.evaluate">
<code class="descname">evaluate</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegressionModel.evaluate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.evaluate" title="Permalink to this definition"></a></dt>
<dd><p>Evaluates the model on a test dataset.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>dataset</strong> – Test dataset to evaluate model on, where dataset is an
instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.hasSummary">
<code class="descname">hasSummary</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.hasSummary" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether a training summary exists for this model
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Model intercept of binomial logistic regression.
An exception is thrown in the case of multinomial logistic regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.interceptVector">
<code class="descname">interceptVector</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.interceptVector" title="Permalink to this definition"></a></dt>
<dd><p>Model intercept.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.numClasses">
<code class="descname">numClasses</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.numClasses" title="Permalink to this definition"></a></dt>
<dd><p>Number of classes (values which the label can take).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionModel.summary">
<code class="descname">summary</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.summary" title="Permalink to this definition"></a></dt>
<dd><p>Gets summary (e.g. accuracy/precision/recall, objective history, total iterations) of model
trained on the training set. An exception is thrown if <cite>trainingSummary is None</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.LogisticRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.LogisticRegressionSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">LogisticRegressionSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegressionSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Abstraction for Logistic Regression Results for a given model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the features of each instance
as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionSummary.labelCol">
<code class="descname">labelCol</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionSummary.labelCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the true label of each
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>Dataframe outputted by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionSummary.probabilityCol">
<code class="descname">probabilityCol</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionSummary.probabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the probability
of each class as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.LogisticRegressionTrainingSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">LogisticRegressionTrainingSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#LogisticRegressionTrainingSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionTrainingSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Abstraction for multinomial Logistic Regression Training results.
Currently, the training summary ignores the training weights except
for the objective trace.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionTrainingSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionTrainingSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the features of each instance
as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionTrainingSummary.labelCol">
<code class="descname">labelCol</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionTrainingSummary.labelCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the true label of each
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionTrainingSummary.objectiveHistory">
<code class="descname">objectiveHistory</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionTrainingSummary.objectiveHistory" title="Permalink to this definition"></a></dt>
<dd><p>Objective function (scaled loss + regularization) at each
iteration.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionTrainingSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionTrainingSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>Dataframe outputted by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionTrainingSummary.probabilityCol">
<code class="descname">probabilityCol</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionTrainingSummary.probabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the probability
of each class as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.LogisticRegressionTrainingSummary.totalIterations">
<code class="descname">totalIterations</code><a class="headerlink" href="#pyspark.ml.classification.LogisticRegressionTrainingSummary.totalIterations" title="Permalink to this definition"></a></dt>
<dd><p>Number of training iterations until termination.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">BinaryLogisticRegressionSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#BinaryLogisticRegressionSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Binary Logistic regression results for a given model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.areaUnderROC">
<code class="descname">areaUnderROC</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.areaUnderROC" title="Permalink to this definition"></a></dt>
<dd><p>Computes the area under the receiver operating characteristic
(ROC) curve.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.fMeasureByThreshold">
<code class="descname">fMeasureByThreshold</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.fMeasureByThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns a dataframe with two fields (threshold, F-Measure) representing
the F-Measure curve with beta = 1.0.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the features of each instance
as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.labelCol">
<code class="descname">labelCol</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.labelCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the true label of each
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.pr">
<code class="descname">pr</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.pr" title="Permalink to this definition"></a></dt>
<dd><p>Returns the precision-recall curve, which is a Dataframe
containing two fields recall, precision with (0.0, 1.0) prepended
to it.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.precisionByThreshold">
<code class="descname">precisionByThreshold</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.precisionByThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns a dataframe with two fields (threshold, precision) representing the precision curve.
Every possible probability obtained in transforming the dataset
is used as a threshold in calculating the precision.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>Dataframe outputted by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.probabilityCol">
<code class="descname">probabilityCol</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.probabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the probability
of each class as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.recallByThreshold">
<code class="descname">recallByThreshold</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.recallByThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns a dataframe with two fields (threshold, recall) representing the recall curve.
Every possible probability obtained in transforming the dataset
is used as a threshold in calculating the recall.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionSummary.roc">
<code class="descname">roc</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionSummary.roc" title="Permalink to this definition"></a></dt>
<dd><p>Returns the receiver operating characteristic (ROC) curve,
which is a Dataframe having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="http://en.wikipedia.org/wiki/Receiver_operating_characteristic">Wikipedia reference</a></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">BinaryLogisticRegressionTrainingSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#BinaryLogisticRegressionTrainingSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Binary Logistic regression training results for a given model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.areaUnderROC">
<code class="descname">areaUnderROC</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.areaUnderROC" title="Permalink to this definition"></a></dt>
<dd><p>Computes the area under the receiver operating characteristic
(ROC) curve.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.fMeasureByThreshold">
<code class="descname">fMeasureByThreshold</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.fMeasureByThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns a dataframe with two fields (threshold, F-Measure) representing
the F-Measure curve with beta = 1.0.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the features of each instance
as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.labelCol">
<code class="descname">labelCol</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.labelCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the true label of each
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.objectiveHistory">
<code class="descname">objectiveHistory</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.objectiveHistory" title="Permalink to this definition"></a></dt>
<dd><p>Objective function (scaled loss + regularization) at each
iteration.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.pr">
<code class="descname">pr</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.pr" title="Permalink to this definition"></a></dt>
<dd><p>Returns the precision-recall curve, which is a Dataframe
containing two fields recall, precision with (0.0, 1.0) prepended
to it.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.precisionByThreshold">
<code class="descname">precisionByThreshold</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.precisionByThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns a dataframe with two fields (threshold, precision) curve.
Every possible probability obtained in transforming the dataset
is used as a threshold in calculating the precision.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame output by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.probabilityCol">
<code class="descname">probabilityCol</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.probabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the probability
of each class as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.recallByThreshold">
<code class="descname">recallByThreshold</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.recallByThreshold" title="Permalink to this definition"></a></dt>
<dd><p>Returns a dataframe with two fields (threshold, recall) curve.
Every possible probability obtained in transforming the dataset
is used as a threshold in calculating the recall.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.roc">
<code class="descname">roc</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.roc" title="Permalink to this definition"></a></dt>
<dd><p>Returns the receiver operating characteristic (ROC) curve,
which is a Dataframe having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="http://en.wikipedia.org/wiki/Receiver_operating_characteristic">Wikipedia reference</a></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LogisticRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.totalIterations">
<code class="descname">totalIterations</code><a class="headerlink" href="#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary.totalIterations" title="Permalink to this definition"></a></dt>
<dd><p>Number of training iterations until termination.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.DecisionTreeClassifier">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">DecisionTreeClassifier</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>probabilityCol='probability'</em>, <em>rawPredictionCol='rawPrediction'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>impurity='gini'</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#DecisionTreeClassifier"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier" title="Permalink to this definition"></a></dt>
<dd><p><a class="reference external" href="http://en.wikipedia.org/wiki/Decision_tree_learning">Decision tree</a>
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
features.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="k">import</span> <span class="n">StringIndexer</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">si_model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">td</span> <span class="o">=</span> <span class="n">si_model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt</span> <span class="o">=</span> <span class="n">DecisionTreeClassifier</span><span class="p">(</span><span class="n">maxDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">td</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numNodes</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">depth</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">SparseVector(1, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numClasses</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">toDebugString</span><span class="p">)</span>
<span class="go">DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">probability</span>
<span class="go">DenseVector([1.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">rawPrediction</span>
<span class="go">DenseVector([1.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dtc_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/dtc&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">dtc_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt2</span> <span class="o">=</span> <span class="n">DecisionTreeClassifier</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">dtc_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt2</span><span class="o">.</span><span class="n">getMaxDepth</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/dtc_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">DecisionTreeClassificationModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.cacheNodeIds">
<code class="descname">cacheNodeIds</code><em class="property"> = Param(parent='undefined', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.cacheNodeIds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getCacheNodeIds">
<code class="descname">getCacheNodeIds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of cacheNodeIds or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getImpurity">
<code class="descname">getImpurity</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of impurity or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getMaxBins">
<code class="descname">getMaxBins</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxBins or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getMaxDepth">
<code class="descname">getMaxDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getMaxMemoryInMB">
<code class="descname">getMaxMemoryInMB</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxMemoryInMB or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getMinInfoGain">
<code class="descname">getMinInfoGain</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInfoGain or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getMinInstancesPerNode">
<code class="descname">getMinInstancesPerNode</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInstancesPerNode or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getProbabilityCol">
<code class="descname">getProbabilityCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of probabilityCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getRawPredictionCol">
<code class="descname">getRawPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of rawPredictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.impurity">
<code class="descname">impurity</code><em class="property"> = Param(parent='undefined', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.impurity" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.maxBins">
<code class="descname">maxBins</code><em class="property"> = Param(parent='undefined', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be &gt;=2 and &gt;= number of categories for any categorical feature.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.maxBins" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.maxDepth">
<code class="descname">maxDepth</code><em class="property"> = Param(parent='undefined', name='maxDepth', doc='Maximum depth of the tree. (&gt;= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.maxDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.maxMemoryInMB">
<code class="descname">maxMemoryInMB</code><em class="property"> = Param(parent='undefined', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.maxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.minInfoGain">
<code class="descname">minInfoGain</code><em class="property"> = Param(parent='undefined', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.minInfoGain" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.minInstancesPerNode">
<code class="descname">minInstancesPerNode</code><em class="property"> = Param(parent='undefined', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be &gt;= 1.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.minInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.probabilityCol">
<code class="descname">probabilityCol</code><em class="property"> = Param(parent='undefined', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.probabilityCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.rawPredictionCol">
<code class="descname">rawPredictionCol</code><em class="property"> = Param(parent='undefined', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.rawPredictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setCacheNodeIds">
<code class="descname">setCacheNodeIds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.cacheNodeIds" title="pyspark.ml.classification.DecisionTreeClassifier.cacheNodeIds"><code class="xref py py-attr docutils literal"><span class="pre">cacheNodeIds</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.checkpointInterval" title="pyspark.ml.classification.DecisionTreeClassifier.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.featuresCol" title="pyspark.ml.classification.DecisionTreeClassifier.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setImpurity">
<code class="descname">setImpurity</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.impurity" title="pyspark.ml.classification.DecisionTreeClassifier.impurity"><code class="xref py py-attr docutils literal"><span class="pre">impurity</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.labelCol" title="pyspark.ml.classification.DecisionTreeClassifier.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setMaxBins">
<code class="descname">setMaxBins</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.maxBins" title="pyspark.ml.classification.DecisionTreeClassifier.maxBins"><code class="xref py py-attr docutils literal"><span class="pre">maxBins</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setMaxDepth">
<code class="descname">setMaxDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.maxDepth" title="pyspark.ml.classification.DecisionTreeClassifier.maxDepth"><code class="xref py py-attr docutils literal"><span class="pre">maxDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setMaxMemoryInMB">
<code class="descname">setMaxMemoryInMB</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.maxMemoryInMB" title="pyspark.ml.classification.DecisionTreeClassifier.maxMemoryInMB"><code class="xref py py-attr docutils literal"><span class="pre">maxMemoryInMB</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setMinInfoGain">
<code class="descname">setMinInfoGain</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.minInfoGain" title="pyspark.ml.classification.DecisionTreeClassifier.minInfoGain"><code class="xref py py-attr docutils literal"><span class="pre">minInfoGain</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setMinInstancesPerNode">
<code class="descname">setMinInstancesPerNode</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.minInstancesPerNode" title="pyspark.ml.classification.DecisionTreeClassifier.minInstancesPerNode"><code class="xref py py-attr docutils literal"><span class="pre">minInstancesPerNode</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>probabilityCol=&quot;probability&quot;</em>, <em>rawPredictionCol=&quot;rawPrediction&quot;</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>impurity=&quot;gini&quot;</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#DecisionTreeClassifier.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for the DecisionTreeClassifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.predictionCol" title="pyspark.ml.classification.DecisionTreeClassifier.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setProbabilityCol">
<code class="descname">setProbabilityCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.probabilityCol" title="pyspark.ml.classification.DecisionTreeClassifier.probabilityCol"><code class="xref py py-attr docutils literal"><span class="pre">probabilityCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setRawPredictionCol">
<code class="descname">setRawPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.rawPredictionCol" title="pyspark.ml.classification.DecisionTreeClassifier.rawPredictionCol"><code class="xref py py-attr docutils literal"><span class="pre">rawPredictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassifier.seed" title="pyspark.ml.classification.DecisionTreeClassifier.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.supportedImpurities">
<code class="descname">supportedImpurities</code><em class="property"> = ['entropy', 'gini']</em><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.supportedImpurities" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassifier.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassifier.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">DecisionTreeClassificationModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#DecisionTreeClassificationModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by DecisionTreeClassifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.depth">
<code class="descname">depth</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.depth" title="Permalink to this definition"></a></dt>
<dd><p>Return depth of the decision tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.featureImportances">
<code class="descname">featureImportances</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.featureImportances" title="Permalink to this definition"></a></dt>
<dd><p>Estimate of the importance of each feature.</p>
<p>This generalizes the idea of “Gini” importance to other losses,
following the explanation of Gini importance from “Random Forests” documentation
by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.</p>
<dl class="docutils">
<dt>This feature importance is calculated as follows:</dt>
<dd><ul class="first last simple">
<li>importance(feature j) = sum (over nodes which split on feature j) of the gain,
where gain is scaled by the number of instances passing through node</li>
<li>Normalize importances for tree to sum to 1.</li>
</ul>
</dd>
</dl>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Feature importance for single decision trees can have high variance due to
correlated predictor variables. Consider using a <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier" title="pyspark.ml.classification.RandomForestClassifier"><code class="xref py py-class docutils literal"><span class="pre">RandomForestClassifier</span></code></a>
to determine feature importance instead.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.numClasses">
<code class="descname">numClasses</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.numClasses" title="Permalink to this definition"></a></dt>
<dd><p>Number of classes (values which the label can take).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.numNodes">
<code class="descname">numNodes</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.numNodes" title="Permalink to this definition"></a></dt>
<dd><p>Return number of nodes of the decision tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.toDebugString">
<code class="descname">toDebugString</code><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full description of model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.DecisionTreeClassificationModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.DecisionTreeClassificationModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.GBTClassifier">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">GBTClassifier</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>lossType='logistic'</em>, <em>maxIter=20</em>, <em>stepSize=0.1</em>, <em>seed=None</em>, <em>subsamplingRate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#GBTClassifier"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier" title="Permalink to this definition"></a></dt>
<dd><p><a class="reference external" href="https://en.wikipedia.org/wiki/Gradient_boosting">Gradient-Boosted Trees (GBTs)</a>
learning algorithm for classification.
It supports binary labels, as well as both continuous and categorical features.</p>
<p>The implementation is based upon: J.H. Friedman. “Stochastic Gradient Boosting.” 1999.</p>
<p>Notes on Gradient Boosting vs. TreeBoost:</p>
<ul class="simple">
<li>This implementation is for Stochastic Gradient Boosting, not for TreeBoost.</li>
<li>Both algorithms learn tree ensembles by minimizing loss functions.</li>
<li>TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
based on the loss function, whereas the original gradient boosting method does not.</li>
<li>We expect to implement TreeBoost in the future:
<a class="reference external" href="https://issues.apache.org/jira/browse/SPARK-4240">SPARK-4240</a></li>
</ul>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Multiclass labels are not currently supported.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">numpy</span> <span class="k">import</span> <span class="n">allclose</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="k">import</span> <span class="n">StringIndexer</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">si_model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">td</span> <span class="o">=</span> <span class="n">si_model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt</span> <span class="o">=</span> <span class="n">GBTClassifier</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">maxDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">gbt</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">td</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">SparseVector(1, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">allclose</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">treeWeights</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">])</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">totalNumNodes</span>
<span class="go">15</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">toDebugString</span><span class="p">)</span>
<span class="go">GBTClassificationModel (uid=...)...with 5 trees...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbtc_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;gbtc&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">gbtc_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt2</span> <span class="o">=</span> <span class="n">GBTClassifier</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">gbtc_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt2</span><span class="o">.</span><span class="n">getMaxDepth</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;gbtc_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">GBTClassificationModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">treeWeights</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">treeWeights</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">trees</span>
<span class="go">[DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.cacheNodeIds">
<code class="descname">cacheNodeIds</code><em class="property"> = Param(parent='undefined', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often the cache should be checkpointed, or disable it, by setting checkpointInterval.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.cacheNodeIds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getCacheNodeIds">
<code class="descname">getCacheNodeIds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of cacheNodeIds or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getLossType">
<code class="descname">getLossType</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#GBTClassifier.getLossType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getLossType" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of lossType or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getMaxBins">
<code class="descname">getMaxBins</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxBins or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getMaxDepth">
<code class="descname">getMaxDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getMaxMemoryInMB">
<code class="descname">getMaxMemoryInMB</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxMemoryInMB or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getMinInfoGain">
<code class="descname">getMinInfoGain</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInfoGain or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getMinInstancesPerNode">
<code class="descname">getMinInstancesPerNode</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInstancesPerNode or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getStepSize">
<code class="descname">getStepSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of stepSize or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.getSubsamplingRate">
<code class="descname">getSubsamplingRate</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.getSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of subsamplingRate or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.lossType">
<code class="descname">lossType</code><em class="property"> = Param(parent='undefined', name='lossType', doc='Loss function which GBT tries to minimize (case-insensitive). Supported options: logistic')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.lossType" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.maxBins">
<code class="descname">maxBins</code><em class="property"> = Param(parent='undefined', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be &gt;=2 and &gt;= number of categories for any categorical feature.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.maxBins" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.maxDepth">
<code class="descname">maxDepth</code><em class="property"> = Param(parent='undefined', name='maxDepth', doc='Maximum depth of the tree. (&gt;= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.maxDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.maxMemoryInMB">
<code class="descname">maxMemoryInMB</code><em class="property"> = Param(parent='undefined', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.maxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.minInfoGain">
<code class="descname">minInfoGain</code><em class="property"> = Param(parent='undefined', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.minInfoGain" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.minInstancesPerNode">
<code class="descname">minInstancesPerNode</code><em class="property"> = Param(parent='undefined', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be &gt;= 1.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.minInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setCacheNodeIds">
<code class="descname">setCacheNodeIds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.cacheNodeIds" title="pyspark.ml.classification.GBTClassifier.cacheNodeIds"><code class="xref py py-attr docutils literal"><span class="pre">cacheNodeIds</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.checkpointInterval" title="pyspark.ml.classification.GBTClassifier.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.featuresCol" title="pyspark.ml.classification.GBTClassifier.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.labelCol" title="pyspark.ml.classification.GBTClassifier.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setLossType">
<code class="descname">setLossType</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#GBTClassifier.setLossType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setLossType" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.lossType" title="pyspark.ml.classification.GBTClassifier.lossType"><code class="xref py py-attr docutils literal"><span class="pre">lossType</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setMaxBins">
<code class="descname">setMaxBins</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.maxBins" title="pyspark.ml.classification.GBTClassifier.maxBins"><code class="xref py py-attr docutils literal"><span class="pre">maxBins</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setMaxDepth">
<code class="descname">setMaxDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.maxDepth" title="pyspark.ml.classification.GBTClassifier.maxDepth"><code class="xref py py-attr docutils literal"><span class="pre">maxDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.maxIter" title="pyspark.ml.classification.GBTClassifier.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setMaxMemoryInMB">
<code class="descname">setMaxMemoryInMB</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.maxMemoryInMB" title="pyspark.ml.classification.GBTClassifier.maxMemoryInMB"><code class="xref py py-attr docutils literal"><span class="pre">maxMemoryInMB</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setMinInfoGain">
<code class="descname">setMinInfoGain</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.minInfoGain" title="pyspark.ml.classification.GBTClassifier.minInfoGain"><code class="xref py py-attr docutils literal"><span class="pre">minInfoGain</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setMinInstancesPerNode">
<code class="descname">setMinInstancesPerNode</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.minInstancesPerNode" title="pyspark.ml.classification.GBTClassifier.minInstancesPerNode"><code class="xref py py-attr docutils literal"><span class="pre">minInstancesPerNode</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>lossType=&quot;logistic&quot;</em>, <em>maxIter=20</em>, <em>stepSize=0.1</em>, <em>seed=None</em>, <em>subsamplingRate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#GBTClassifier.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for Gradient Boosted Tree Classification.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.predictionCol" title="pyspark.ml.classification.GBTClassifier.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.seed" title="pyspark.ml.classification.GBTClassifier.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setStepSize">
<code class="descname">setStepSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.stepSize" title="pyspark.ml.classification.GBTClassifier.stepSize"><code class="xref py py-attr docutils literal"><span class="pre">stepSize</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.setSubsamplingRate">
<code class="descname">setSubsamplingRate</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.setSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.GBTClassifier.subsamplingRate" title="pyspark.ml.classification.GBTClassifier.subsamplingRate"><code class="xref py py-attr docutils literal"><span class="pre">subsamplingRate</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.stepSize">
<code class="descname">stepSize</code><em class="property"> = Param(parent='undefined', name='stepSize', doc='Step size to be used for each iteration of optimization (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.stepSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.subsamplingRate">
<code class="descname">subsamplingRate</code><em class="property"> = Param(parent='undefined', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].')</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.subsamplingRate" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassifier.supportedLossTypes">
<code class="descname">supportedLossTypes</code><em class="property"> = ['logistic']</em><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.supportedLossTypes" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassifier.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassifier.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.GBTClassificationModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">GBTClassificationModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#GBTClassificationModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by GBTClassifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.featureImportances">
<code class="descname">featureImportances</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.featureImportances" title="Permalink to this definition"></a></dt>
<dd><p>Estimate of the importance of each feature.</p>
<p>Each feature’s importance is the average of its importance across all trees in the ensemble.
The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
(Hastie, Tibshirani, Friedman. “The Elements of Statistical Learning, 2nd Edition.” 2001.)
and follows the implementation from scikit-learn.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassificationModel.featureImportances" title="pyspark.ml.classification.DecisionTreeClassificationModel.featureImportances"><code class="xref py py-attr docutils literal"><span class="pre">DecisionTreeClassificationModel.featureImportances</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.getNumTrees">
<code class="descname">getNumTrees</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.getNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Number of trees in ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.toDebugString">
<code class="descname">toDebugString</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full description of model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.totalNumNodes">
<code class="descname">totalNumNodes</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.totalNumNodes" title="Permalink to this definition"></a></dt>
<dd><p>Total number of nodes, summed over all trees in the ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.treeWeights">
<code class="descname">treeWeights</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.treeWeights" title="Permalink to this definition"></a></dt>
<dd><p>Returns the weights for each tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.GBTClassificationModel.trees">
<code class="descname">trees</code><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.trees" title="Permalink to this definition"></a></dt>
<dd><p>Trees in this ensemble. Warning: These have null parent Estimators.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.GBTClassificationModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.GBTClassificationModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.RandomForestClassifier">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">RandomForestClassifier</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>probabilityCol='probability'</em>, <em>rawPredictionCol='rawPrediction'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>impurity='gini'</em>, <em>numTrees=20</em>, <em>featureSubsetStrategy='auto'</em>, <em>seed=None</em>, <em>subsamplingRate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#RandomForestClassifier"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier" title="Permalink to this definition"></a></dt>
<dd><p><a class="reference external" href="https://en.wikipedia.org/wiki/Random_forest">Random Forest</a>
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
features.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">numpy</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">numpy</span> <span class="k">import</span> <span class="n">allclose</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="k">import</span> <span class="n">StringIndexer</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">si_model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">td</span> <span class="o">=</span> <span class="n">si_model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf</span> <span class="o">=</span> <span class="n">RandomForestClassifier</span><span class="p">(</span><span class="n">numTrees</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">maxDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">rf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">td</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">SparseVector(1, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">allclose</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">treeWeights</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">numpy</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">probability</span><span class="p">)</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">numpy</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">rawPrediction</span><span class="p">)</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">trees</span>
<span class="go">[DecisionTreeClassificationModel (uid=...) of depth..., DecisionTreeClassificationModel...]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rfc_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/rfc&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">rfc_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf2</span> <span class="o">=</span> <span class="n">RandomForestClassifier</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">rfc_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf2</span><span class="o">.</span><span class="n">getNumTrees</span><span class="p">()</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/rfc_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">RandomForestClassificationModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.cacheNodeIds">
<code class="descname">cacheNodeIds</code><em class="property"> = Param(parent='undefined', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.cacheNodeIds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.featureSubsetStrategy">
<code class="descname">featureSubsetStrategy</code><em class="property"> = Param(parent='undefined', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.featureSubsetStrategy" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getCacheNodeIds">
<code class="descname">getCacheNodeIds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of cacheNodeIds or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getFeatureSubsetStrategy">
<code class="descname">getFeatureSubsetStrategy</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getFeatureSubsetStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featureSubsetStrategy or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getImpurity">
<code class="descname">getImpurity</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of impurity or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getMaxBins">
<code class="descname">getMaxBins</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxBins or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getMaxDepth">
<code class="descname">getMaxDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getMaxMemoryInMB">
<code class="descname">getMaxMemoryInMB</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxMemoryInMB or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getMinInfoGain">
<code class="descname">getMinInfoGain</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInfoGain or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getMinInstancesPerNode">
<code class="descname">getMinInstancesPerNode</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInstancesPerNode or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getNumTrees">
<code class="descname">getNumTrees</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numTrees or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getProbabilityCol">
<code class="descname">getProbabilityCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of probabilityCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getRawPredictionCol">
<code class="descname">getRawPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of rawPredictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.getSubsamplingRate">
<code class="descname">getSubsamplingRate</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.getSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of subsamplingRate or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.impurity">
<code class="descname">impurity</code><em class="property"> = Param(parent='undefined', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.impurity" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.maxBins">
<code class="descname">maxBins</code><em class="property"> = Param(parent='undefined', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be &gt;=2 and &gt;= number of categories for any categorical feature.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.maxBins" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.maxDepth">
<code class="descname">maxDepth</code><em class="property"> = Param(parent='undefined', name='maxDepth', doc='Maximum depth of the tree. (&gt;= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.maxDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.maxMemoryInMB">
<code class="descname">maxMemoryInMB</code><em class="property"> = Param(parent='undefined', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.maxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.minInfoGain">
<code class="descname">minInfoGain</code><em class="property"> = Param(parent='undefined', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.minInfoGain" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.minInstancesPerNode">
<code class="descname">minInstancesPerNode</code><em class="property"> = Param(parent='undefined', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be &gt;= 1.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.minInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.numTrees">
<code class="descname">numTrees</code><em class="property"> = Param(parent='undefined', name='numTrees', doc='Number of trees to train (&gt;= 1).')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.numTrees" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.probabilityCol">
<code class="descname">probabilityCol</code><em class="property"> = Param(parent='undefined', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.probabilityCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.rawPredictionCol">
<code class="descname">rawPredictionCol</code><em class="property"> = Param(parent='undefined', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.rawPredictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setCacheNodeIds">
<code class="descname">setCacheNodeIds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.cacheNodeIds" title="pyspark.ml.classification.RandomForestClassifier.cacheNodeIds"><code class="xref py py-attr docutils literal"><span class="pre">cacheNodeIds</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.checkpointInterval" title="pyspark.ml.classification.RandomForestClassifier.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setFeatureSubsetStrategy">
<code class="descname">setFeatureSubsetStrategy</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setFeatureSubsetStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.featureSubsetStrategy" title="pyspark.ml.classification.RandomForestClassifier.featureSubsetStrategy"><code class="xref py py-attr docutils literal"><span class="pre">featureSubsetStrategy</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.featuresCol" title="pyspark.ml.classification.RandomForestClassifier.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setImpurity">
<code class="descname">setImpurity</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.impurity" title="pyspark.ml.classification.RandomForestClassifier.impurity"><code class="xref py py-attr docutils literal"><span class="pre">impurity</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.labelCol" title="pyspark.ml.classification.RandomForestClassifier.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setMaxBins">
<code class="descname">setMaxBins</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.maxBins" title="pyspark.ml.classification.RandomForestClassifier.maxBins"><code class="xref py py-attr docutils literal"><span class="pre">maxBins</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setMaxDepth">
<code class="descname">setMaxDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.maxDepth" title="pyspark.ml.classification.RandomForestClassifier.maxDepth"><code class="xref py py-attr docutils literal"><span class="pre">maxDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setMaxMemoryInMB">
<code class="descname">setMaxMemoryInMB</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.maxMemoryInMB" title="pyspark.ml.classification.RandomForestClassifier.maxMemoryInMB"><code class="xref py py-attr docutils literal"><span class="pre">maxMemoryInMB</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setMinInfoGain">
<code class="descname">setMinInfoGain</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.minInfoGain" title="pyspark.ml.classification.RandomForestClassifier.minInfoGain"><code class="xref py py-attr docutils literal"><span class="pre">minInfoGain</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setMinInstancesPerNode">
<code class="descname">setMinInstancesPerNode</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.minInstancesPerNode" title="pyspark.ml.classification.RandomForestClassifier.minInstancesPerNode"><code class="xref py py-attr docutils literal"><span class="pre">minInstancesPerNode</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setNumTrees">
<code class="descname">setNumTrees</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.numTrees" title="pyspark.ml.classification.RandomForestClassifier.numTrees"><code class="xref py py-attr docutils literal"><span class="pre">numTrees</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>probabilityCol=&quot;probability&quot;</em>, <em>rawPredictionCol=&quot;rawPrediction&quot;</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>seed=None</em>, <em>impurity=&quot;gini&quot;</em>, <em>numTrees=20</em>, <em>featureSubsetStrategy=&quot;auto&quot;</em>, <em>subsamplingRate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#RandomForestClassifier.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for random forest classification.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.predictionCol" title="pyspark.ml.classification.RandomForestClassifier.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setProbabilityCol">
<code class="descname">setProbabilityCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.probabilityCol" title="pyspark.ml.classification.RandomForestClassifier.probabilityCol"><code class="xref py py-attr docutils literal"><span class="pre">probabilityCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setRawPredictionCol">
<code class="descname">setRawPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.rawPredictionCol" title="pyspark.ml.classification.RandomForestClassifier.rawPredictionCol"><code class="xref py py-attr docutils literal"><span class="pre">rawPredictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.seed" title="pyspark.ml.classification.RandomForestClassifier.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.setSubsamplingRate">
<code class="descname">setSubsamplingRate</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.setSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.RandomForestClassifier.subsamplingRate" title="pyspark.ml.classification.RandomForestClassifier.subsamplingRate"><code class="xref py py-attr docutils literal"><span class="pre">subsamplingRate</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.subsamplingRate">
<code class="descname">subsamplingRate</code><em class="property"> = Param(parent='undefined', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].')</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.subsamplingRate" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.supportedFeatureSubsetStrategies">
<code class="descname">supportedFeatureSubsetStrategies</code><em class="property"> = ['auto', 'all', 'onethird', 'sqrt', 'log2']</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.supportedFeatureSubsetStrategies" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassifier.supportedImpurities">
<code class="descname">supportedImpurities</code><em class="property"> = ['entropy', 'gini']</em><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.supportedImpurities" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassifier.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassifier.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.RandomForestClassificationModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">RandomForestClassificationModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#RandomForestClassificationModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by RandomForestClassifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.featureImportances">
<code class="descname">featureImportances</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.featureImportances" title="Permalink to this definition"></a></dt>
<dd><p>Estimate of the importance of each feature.</p>
<p>Each feature’s importance is the average of its importance across all trees in the ensemble.
The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
(Hastie, Tibshirani, Friedman. “The Elements of Statistical Learning, 2nd Edition.” 2001.)
and follows the implementation from scikit-learn.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.classification.DecisionTreeClassificationModel.featureImportances" title="pyspark.ml.classification.DecisionTreeClassificationModel.featureImportances"><code class="xref py py-attr docutils literal"><span class="pre">DecisionTreeClassificationModel.featureImportances</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.getNumTrees">
<code class="descname">getNumTrees</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.getNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Number of trees in ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.numClasses">
<code class="descname">numClasses</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.numClasses" title="Permalink to this definition"></a></dt>
<dd><p>Number of classes (values which the label can take).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.toDebugString">
<code class="descname">toDebugString</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full description of model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.totalNumNodes">
<code class="descname">totalNumNodes</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.totalNumNodes" title="Permalink to this definition"></a></dt>
<dd><p>Total number of nodes, summed over all trees in the ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.treeWeights">
<code class="descname">treeWeights</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.treeWeights" title="Permalink to this definition"></a></dt>
<dd><p>Returns the weights for each tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.trees">
<code class="descname">trees</code><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.trees" title="Permalink to this definition"></a></dt>
<dd><p>Trees in this ensemble. Warning: These have null parent Estimators.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.RandomForestClassificationModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.RandomForestClassificationModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.NaiveBayes">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">NaiveBayes</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>probabilityCol='probability'</em>, <em>rawPredictionCol='rawPrediction'</em>, <em>smoothing=1.0</em>, <em>modelType='multinomial'</em>, <em>thresholds=None</em>, <em>weightCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#NaiveBayes"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes" title="Permalink to this definition"></a></dt>
<dd><p>Naive Bayes Classifiers.
It supports both Multinomial and Bernoulli NB. <a class="reference external" href="http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html">Multinomial NB</a>
can handle finitely supported discrete data. For example, by converting documents into
TF-IDF vectors, it can be used for document classification. By making every vector
binary (0/1) data, it can also be used as <a class="reference external" href="http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html">Bernoulli NB</a>.
The input feature values must be nonnegative.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="k">import</span> <span class="n">Row</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">weight</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">weight</span><span class="o">=</span><span class="mf">0.5</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="n">label</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">weight</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]))])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nb</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="p">(</span><span class="n">smoothing</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">modelType</span><span class="o">=</span><span class="s2">&quot;multinomial&quot;</span><span class="p">,</span> <span class="n">weightCol</span><span class="o">=</span><span class="s2">&quot;weight&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">pi</span>
<span class="go">DenseVector([-0.81..., -0.58...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">theta</span>
<span class="go">DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">probability</span>
<span class="go">DenseVector([0.32..., 0.67...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">rawPrediction</span>
<span class="go">DenseVector([-1.72..., -0.99...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nb_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/nb&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nb</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">nb_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nb2</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">nb_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nb2</span><span class="o">.</span><span class="n">getSmoothing</span><span class="p">()</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/nb_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">NaiveBayesModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">pi</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">pi</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">theta</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">theta</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">nb</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">setThresholds</span><span class="p">([</span><span class="mf">0.01</span><span class="p">,</span> <span class="mf">10.00</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model3</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">model3</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getModelType">
<code class="descname">getModelType</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#NaiveBayes.getModelType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getModelType" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of modelType or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getProbabilityCol">
<code class="descname">getProbabilityCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of probabilityCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getRawPredictionCol">
<code class="descname">getRawPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of rawPredictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getSmoothing">
<code class="descname">getSmoothing</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#NaiveBayes.getSmoothing"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getSmoothing" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of smoothing or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getThresholds">
<code class="descname">getThresholds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getThresholds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of thresholds or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.modelType">
<code class="descname">modelType</code><em class="property"> = Param(parent='undefined', name='modelType', doc='The model type which is a string (case-sensitive). Supported options: multinomial (default) and bernoulli.')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.modelType" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.probabilityCol">
<code class="descname">probabilityCol</code><em class="property"> = Param(parent='undefined', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.probabilityCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.rawPredictionCol">
<code class="descname">rawPredictionCol</code><em class="property"> = Param(parent='undefined', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.rawPredictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.featuresCol" title="pyspark.ml.classification.NaiveBayes.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.labelCol" title="pyspark.ml.classification.NaiveBayes.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setModelType">
<code class="descname">setModelType</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#NaiveBayes.setModelType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setModelType" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.modelType" title="pyspark.ml.classification.NaiveBayes.modelType"><code class="xref py py-attr docutils literal"><span class="pre">modelType</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>probabilityCol=&quot;probability&quot;</em>, <em>rawPredictionCol=&quot;rawPrediction&quot;</em>, <em>smoothing=1.0</em>, <em>modelType=&quot;multinomial&quot;</em>, <em>thresholds=None</em>, <em>weightCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#NaiveBayes.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for Naive Bayes.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.predictionCol" title="pyspark.ml.classification.NaiveBayes.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setProbabilityCol">
<code class="descname">setProbabilityCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.probabilityCol" title="pyspark.ml.classification.NaiveBayes.probabilityCol"><code class="xref py py-attr docutils literal"><span class="pre">probabilityCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setRawPredictionCol">
<code class="descname">setRawPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.rawPredictionCol" title="pyspark.ml.classification.NaiveBayes.rawPredictionCol"><code class="xref py py-attr docutils literal"><span class="pre">rawPredictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setSmoothing">
<code class="descname">setSmoothing</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#NaiveBayes.setSmoothing"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setSmoothing" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.smoothing" title="pyspark.ml.classification.NaiveBayes.smoothing"><code class="xref py py-attr docutils literal"><span class="pre">smoothing</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setThresholds">
<code class="descname">setThresholds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setThresholds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.thresholds" title="pyspark.ml.classification.NaiveBayes.thresholds"><code class="xref py py-attr docutils literal"><span class="pre">thresholds</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.NaiveBayes.weightCol" title="pyspark.ml.classification.NaiveBayes.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.smoothing">
<code class="descname">smoothing</code><em class="property"> = Param(parent='undefined', name='smoothing', doc='The smoothing parameter, should be &gt;= 0, default is 1.0')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.smoothing" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.thresholds">
<code class="descname">thresholds</code><em class="property"> = Param(parent='undefined', name='thresholds', doc=&quot;Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values &gt; 0, excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.&quot;)</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.thresholds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayes.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayes.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayes.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.NaiveBayesModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">NaiveBayesModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#NaiveBayesModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by NaiveBayes.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayesModel.numClasses">
<code class="descname">numClasses</code><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.numClasses" title="Permalink to this definition"></a></dt>
<dd><p>Number of classes (values which the label can take).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayesModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayesModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayesModel.pi">
<code class="descname">pi</code><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.pi" title="Permalink to this definition"></a></dt>
<dd><p>Log of class priors.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.NaiveBayesModel.theta">
<code class="descname">theta</code><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.theta" title="Permalink to this definition"></a></dt>
<dd><p>Log of class conditional probabilities.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.NaiveBayesModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.NaiveBayesModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">MultilayerPerceptronClassifier</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxIter=100</em>, <em>tol=1e-06</em>, <em>seed=None</em>, <em>layers=None</em>, <em>blockSize=128</em>, <em>stepSize=0.03</em>, <em>solver='l-bfgs'</em>, <em>initialWeights=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Classifier trainer based on the Multilayer Perceptron.
Each layer has a sigmoid activation function; the output layer has softmax.
Number of inputs has to be equal to the size of feature vectors.
Number of outputs has to be equal to the total number of labels.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlp</span> <span class="o">=</span> <span class="n">MultilayerPerceptronClassifier</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">layers</span><span class="o">=</span><span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="n">blockSize</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">123</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">mlp</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">layers</span>
<span class="go">[2, 2, 2]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">weights</span><span class="o">.</span><span class="n">size</span>
<span class="go">12</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">testDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">testDF</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---------+----------+</span>
<span class="go">| features|prediction|</span>
<span class="go">+---------+----------+</span>
<span class="go">|[1.0,0.0]| 1.0|</span>
<span class="go">|[0.0,0.0]| 0.0|</span>
<span class="go">+---------+----------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlp_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/mlp&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlp</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">mlp_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlp2</span> <span class="o">=</span> <span class="n">MultilayerPerceptronClassifier</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">mlp_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlp2</span><span class="o">.</span><span class="n">getBlockSize</span><span class="p">()</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/mlp_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">MultilayerPerceptronClassificationModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">layers</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">layers</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">weights</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">weights</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mlp2</span> <span class="o">=</span> <span class="n">mlp2</span><span class="o">.</span><span class="n">setInitialWeights</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">12</span><span class="p">)))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model3</span> <span class="o">=</span> <span class="n">mlp2</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model3</span><span class="o">.</span><span class="n">weights</span> <span class="o">!=</span> <span class="n">model2</span><span class="o">.</span><span class="n">weights</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model3</span><span class="o">.</span><span class="n">layers</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">layers</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.blockSize">
<code class="descname">blockSize</code><em class="property"> = Param(parent='undefined', name='blockSize', doc='Block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. Recommended size is between 10 and 1000, default is 128.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.blockSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getBlockSize">
<code class="descname">getBlockSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.getBlockSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getBlockSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of blockSize or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getInitialWeights">
<code class="descname">getInitialWeights</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.getInitialWeights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getInitialWeights" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of initialWeights or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getLayers">
<code class="descname">getLayers</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.getLayers"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getLayers" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of layers or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getSolver">
<code class="descname">getSolver</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.getSolver"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getSolver" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of solver or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getStepSize">
<code class="descname">getStepSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.getStepSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of stepSize or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.initialWeights">
<code class="descname">initialWeights</code><em class="property"> = Param(parent='undefined', name='initialWeights', doc='The initial weights of the model.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.initialWeights" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.layers">
<code class="descname">layers</code><em class="property"> = Param(parent='undefined', name='layers', doc='Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.layers" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setBlockSize">
<code class="descname">setBlockSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.setBlockSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setBlockSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.blockSize" title="pyspark.ml.classification.MultilayerPerceptronClassifier.blockSize"><code class="xref py py-attr docutils literal"><span class="pre">blockSize</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.featuresCol" title="pyspark.ml.classification.MultilayerPerceptronClassifier.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setInitialWeights">
<code class="descname">setInitialWeights</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.setInitialWeights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setInitialWeights" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.initialWeights" title="pyspark.ml.classification.MultilayerPerceptronClassifier.initialWeights"><code class="xref py py-attr docutils literal"><span class="pre">initialWeights</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.labelCol" title="pyspark.ml.classification.MultilayerPerceptronClassifier.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setLayers">
<code class="descname">setLayers</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.setLayers"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setLayers" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.layers" title="pyspark.ml.classification.MultilayerPerceptronClassifier.layers"><code class="xref py py-attr docutils literal"><span class="pre">layers</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.maxIter" title="pyspark.ml.classification.MultilayerPerceptronClassifier.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxIter=100</em>, <em>tol=1e-6</em>, <em>seed=None</em>, <em>layers=None</em>, <em>blockSize=128</em>, <em>stepSize=0.03</em>, <em>solver=&quot;l-bfgs&quot;</em>, <em>initialWeights=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for MultilayerPerceptronClassifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.predictionCol" title="pyspark.ml.classification.MultilayerPerceptronClassifier.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.seed" title="pyspark.ml.classification.MultilayerPerceptronClassifier.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setSolver">
<code class="descname">setSolver</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.setSolver"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setSolver" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.solver" title="pyspark.ml.classification.MultilayerPerceptronClassifier.solver"><code class="xref py py-attr docutils literal"><span class="pre">solver</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setStepSize">
<code class="descname">setStepSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassifier.setStepSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.stepSize" title="pyspark.ml.classification.MultilayerPerceptronClassifier.stepSize"><code class="xref py py-attr docutils literal"><span class="pre">stepSize</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.tol" title="pyspark.ml.classification.MultilayerPerceptronClassifier.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.solver">
<code class="descname">solver</code><em class="property"> = Param(parent='undefined', name='solver', doc='The solver algorithm for optimization. Supported options: l-bfgs, gd.')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.solver" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.stepSize">
<code class="descname">stepSize</code><em class="property"> = Param(parent='undefined', name='stepSize', doc='Step size to be used for each iteration of optimization (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.stepSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassifier.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassifier.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">MultilayerPerceptronClassificationModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#MultilayerPerceptronClassificationModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by MultilayerPerceptronClassifier.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.layers">
<code class="descname">layers</code><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.layers" title="Permalink to this definition"></a></dt>
<dd><p>array of layer sizes including input and output layers.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>the weights of layers.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.MultilayerPerceptronClassificationModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.MultilayerPerceptronClassificationModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.OneVsRest">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">OneVsRest</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>classifier=None</em>, <em>weightCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRest"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRest" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Reduction of Multiclass Classification to Binary Classification.
Performs reduction using the one-against-all strategy.
For a multiclass classification with k classes, train k models (one per class).
Each example is scored against all k models and the model with the highest score
is picked to label the example.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="k">import</span> <span class="n">Row</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data_path</span> <span class="o">=</span> <span class="s2">&quot;data/mllib/sample_multiclass_classification_data.txt&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;libsvm&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">data_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">(</span><span class="n">regParam</span><span class="o">=</span><span class="mf">0.01</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ovr</span> <span class="o">=</span> <span class="n">OneVsRest</span><span class="p">(</span><span class="n">classifier</span><span class="o">=</span><span class="n">lr</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ovr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">models</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">coefficients</span>
<span class="go">DenseVector([0.5..., -1.0..., 3.4..., 4.2...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">models</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">coefficients</span>
<span class="go">DenseVector([-2.1..., 3.1..., -2.6..., -2.3...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">models</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">coefficients</span>
<span class="go">DenseVector([0.3..., -3.4..., 1.0..., -1.1...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">[</span><span class="n">x</span><span class="o">.</span><span class="n">intercept</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">model</span><span class="o">.</span><span class="n">models</span><span class="p">]</span>
<span class="go">[-2.7..., -2.5..., -1.3...]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test2</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">features</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.4</span><span class="p">,</span> <span class="mf">0.3</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">))])</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test2</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/ovr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">OneVsRestModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRest.classifier">
<code class="descname">classifier</code><em class="property"> = Param(parent='undefined', name='classifier', doc='base binary classifier')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.classifier" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRest.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with a randomly generated uid
and some extra params. This creates a deep copy of the embedded paramMap,
and copies the embedded and extra parameters over.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the later value takes precedence if there are
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRest.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.getClassifier">
<code class="descname">getClassifier</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.getClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of classifier or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRest.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRest.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRest.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="classmethod">
<dt id="pyspark.ml.classification.OneVsRest.read">
<em class="property">classmethod </em><code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRest.read"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRest.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.setClassifier">
<code class="descname">setClassifier</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.setClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRest.classifier" title="pyspark.ml.classification.OneVsRest.classifier"><code class="xref py py-attr docutils literal"><span class="pre">classifier</span></code></a>.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Only LogisticRegression and NaiveBayes are supported now.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRest.featuresCol" title="pyspark.ml.classification.OneVsRest.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRest.labelCol" title="pyspark.ml.classification.OneVsRest.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>featuresCol=None</em>, <em>labelCol=None</em>, <em>predictionCol=None</em>, <em>classifier=None</em>, <em>weightCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRest.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, featuresCol=None, labelCol=None, predictionCol=None, classifier=None, weightCol=None):
Sets params for OneVsRest.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRest.predictionCol" title="pyspark.ml.classification.OneVsRest.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRest.weightCol" title="pyspark.ml.classification.OneVsRest.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRest.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRest.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRest.write"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRest.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.classification.OneVsRestModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.classification.</code><code class="descname">OneVsRestModel</code><span class="sig-paren">(</span><em>models</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRestModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by OneVsRest.
This stores the models resulting from training k binary classifiers: one for each class.
Each example is scored against all k models, and the model with the highest score
is picked to label the example.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRestModel.classifier">
<code class="descname">classifier</code><em class="property"> = Param(parent='undefined', name='classifier', doc='base binary classifier')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.classifier" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRestModel.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with a randomly generated uid
and some extra params. This creates a deep copy of the embedded paramMap,
and copies the embedded and extra parameters over.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the later value takes precedence if there are
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRestModel.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.getClassifier">
<code class="descname">getClassifier</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.getClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of classifier or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRestModel.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRestModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRestModel.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="classmethod">
<dt id="pyspark.ml.classification.OneVsRestModel.read">
<em class="property">classmethod </em><code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRestModel.read"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRestModel.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.setClassifier">
<code class="descname">setClassifier</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.setClassifier" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRestModel.classifier" title="pyspark.ml.classification.OneVsRestModel.classifier"><code class="xref py py-attr docutils literal"><span class="pre">classifier</span></code></a>.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Only LogisticRegression and NaiveBayes are supported now.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRestModel.featuresCol" title="pyspark.ml.classification.OneVsRestModel.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRestModel.labelCol" title="pyspark.ml.classification.OneVsRestModel.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRestModel.predictionCol" title="pyspark.ml.classification.OneVsRestModel.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.classification.OneVsRestModel.weightCol" title="pyspark.ml.classification.OneVsRestModel.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.classification.OneVsRestModel.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.classification.OneVsRestModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/classification.html#OneVsRestModel.write"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.classification.OneVsRestModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.clustering">
<span id="pyspark-ml-clustering-module"></span><h2>pyspark.ml.clustering module<a class="headerlink" href="#module-pyspark.ml.clustering" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.clustering.BisectingKMeans">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">BisectingKMeans</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>predictionCol='prediction'</em>, <em>maxIter=20</em>, <em>seed=None</em>, <em>k=4</em>, <em>minDivisibleClusterSize=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeans"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans" title="Permalink to this definition"></a></dt>
<dd><p>A bisecting k-means algorithm based on the paper “A comparison of document clustering
techniques” by Steinbach, Karypis, and Kumar, with modification to fit Spark.
The algorithm starts from a single cluster that contains all points.
Iteratively it finds divisible clusters on the bottom level and bisects each of them using
k-means, until there are <cite>k</cite> leaf clusters in total or no leaf clusters are divisible.
The bisecting steps of clusters on the same level are grouped together to increase parallelism.
If bisecting all divisible clusters on the bottom level would result in more than <cite>k</cite> leaf
clusters, larger clusters get higher priority.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">9.0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">8.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bkm</span> <span class="o">=</span> <span class="n">BisectingKMeans</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">minDivisibleClusterSize</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">bkm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">centers</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">centers</span><span class="p">)</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">computeCost</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="go">2.000...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">hasSummary</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">summary</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span><span class="o">.</span><span class="n">k</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span><span class="o">.</span><span class="n">clusterSizes</span>
<span class="go">[2, 2]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">transformed</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="s2">&quot;prediction&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">transformed</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span> <span class="o">==</span> <span class="n">rows</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span> <span class="o">==</span> <span class="n">rows</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bkm_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/bkm&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bkm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">bkm_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bkm2</span> <span class="o">=</span> <span class="n">BisectingKMeans</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">bkm_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bkm2</span><span class="o">.</span><span class="n">getK</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/bkm_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">BisectingKMeansModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">hasSummary</span>
<span class="go">False</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">array([ True, True], dtype=bool)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">1</span><span class="p">]</span>
<span class="go">array([ True, True], dtype=bool)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeans.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getK">
<code class="descname">getK</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeans.getK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getK" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <cite>k</cite> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getMinDivisibleClusterSize">
<code class="descname">getMinDivisibleClusterSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeans.getMinDivisibleClusterSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getMinDivisibleClusterSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <cite>minDivisibleClusterSize</cite> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeans.k">
<code class="descname">k</code><em class="property"> = Param(parent='undefined', name='k', doc='The desired number of leaf clusters. Must be &gt; 1.')</em><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.k" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeans.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeans.minDivisibleClusterSize">
<code class="descname">minDivisibleClusterSize</code><em class="property"> = Param(parent='undefined', name='minDivisibleClusterSize', doc='The minimum number of points (if &gt;= 1.0) or the minimum proportion of points (if &lt; 1.0) of a divisible cluster.')</em><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.minDivisibleClusterSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeans.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeans.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeans.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.BisectingKMeans.featuresCol" title="pyspark.ml.clustering.BisectingKMeans.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.setK">
<code class="descname">setK</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeans.setK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.setK" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.BisectingKMeans.k" title="pyspark.ml.clustering.BisectingKMeans.k"><code class="xref py py-attr docutils literal"><span class="pre">k</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.BisectingKMeans.maxIter" title="pyspark.ml.clustering.BisectingKMeans.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.setMinDivisibleClusterSize">
<code class="descname">setMinDivisibleClusterSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeans.setMinDivisibleClusterSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.setMinDivisibleClusterSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.BisectingKMeans.minDivisibleClusterSize" title="pyspark.ml.clustering.BisectingKMeans.minDivisibleClusterSize"><code class="xref py py-attr docutils literal"><span class="pre">minDivisibleClusterSize</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxIter=20</em>, <em>seed=None</em>, <em>k=4</em>, <em>minDivisibleClusterSize=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeans.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for BisectingKMeans.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.BisectingKMeans.predictionCol" title="pyspark.ml.clustering.BisectingKMeans.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.BisectingKMeans.seed" title="pyspark.ml.clustering.BisectingKMeans.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeans.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeans.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.BisectingKMeansModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">BisectingKMeansModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeansModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by BisectingKMeans.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.clusterCenters">
<code class="descname">clusterCenters</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeansModel.clusterCenters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.clusterCenters" title="Permalink to this definition"></a></dt>
<dd><p>Get the cluster centers, represented as a list of NumPy arrays.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.computeCost">
<code class="descname">computeCost</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeansModel.computeCost"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.computeCost" title="Permalink to this definition"></a></dt>
<dd><p>Computes the sum of squared distances between the input points
and their corresponding cluster centers.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.hasSummary">
<code class="descname">hasSummary</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.hasSummary" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether a training summary exists for this model instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.summary">
<code class="descname">summary</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.summary" title="Permalink to this definition"></a></dt>
<dd><p>Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the
training set. An exception is thrown if no summary exists.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.BisectingKMeansModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.BisectingKMeansSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">BisectingKMeansSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#BisectingKMeansSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Bisecting KMeans clustering results for a given model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansSummary.cluster">
<code class="descname">cluster</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansSummary.cluster" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame of predicted cluster centers for each training data point.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansSummary.clusterSizes">
<code class="descname">clusterSizes</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansSummary.clusterSizes" title="Permalink to this definition"></a></dt>
<dd><p>Size of (number of data points in) each cluster.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Name for column of features in <cite>predictions</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansSummary.k">
<code class="descname">k</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansSummary.k" title="Permalink to this definition"></a></dt>
<dd><p>The number of clusters the model was trained with.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansSummary.predictionCol">
<code class="descname">predictionCol</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansSummary.predictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Name for column of predicted clusters in <cite>predictions</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.BisectingKMeansSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.clustering.BisectingKMeansSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame produced by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.KMeans">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">KMeans</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>predictionCol='prediction'</em>, <em>k=2</em>, <em>initMode='k-means||'</em>, <em>initSteps=2</em>, <em>tol=0.0001</em>, <em>maxIter=20</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans" title="Permalink to this definition"></a></dt>
<dd><p>K-means clustering with a k-means++ like initialization mode
(the k-means|| algorithm by Bahmani et al.).</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">9.0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">8.0</span><span class="p">,</span> <span class="mf">9.0</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">kmeans</span> <span class="o">=</span> <span class="n">KMeans</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">kmeans</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">centers</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">centers</span><span class="p">)</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">computeCost</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="go">2.000...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">transformed</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="s2">&quot;prediction&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">transformed</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span> <span class="o">==</span> <span class="n">rows</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span> <span class="o">==</span> <span class="n">rows</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">hasSummary</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">summary</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span><span class="o">.</span><span class="n">k</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span><span class="o">.</span><span class="n">clusterSizes</span>
<span class="go">[2, 2]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">kmeans_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/kmeans&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">kmeans</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">kmeans_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">kmeans2</span> <span class="o">=</span> <span class="n">KMeans</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">kmeans_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">kmeans2</span><span class="o">.</span><span class="n">getK</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/kmeans_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">KMeansModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">hasSummary</span>
<span class="go">False</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">array([ True, True], dtype=bool)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">clusterCenters</span><span class="p">()[</span><span class="mi">1</span><span class="p">]</span>
<span class="go">array([ True, True], dtype=bool)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getInitMode">
<code class="descname">getInitMode</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans.getInitMode"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getInitMode" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <cite>initMode</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getInitSteps">
<code class="descname">getInitSteps</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans.getInitSteps"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getInitSteps" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <cite>initSteps</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getK">
<code class="descname">getK</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans.getK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getK" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <cite>k</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.initMode">
<code class="descname">initMode</code><em class="property"> = Param(parent='undefined', name='initMode', doc='The initialization algorithm. This can be either &quot;random&quot; to choose random points as initial cluster centers, or &quot;k-means||&quot; to use a parallel variant of k-means++')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.initMode" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.initSteps">
<code class="descname">initSteps</code><em class="property"> = Param(parent='undefined', name='initSteps', doc='The number of steps for k-means|| initialization mode. Must be &gt; 0.')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.initSteps" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.k">
<code class="descname">k</code><em class="property"> = Param(parent='undefined', name='k', doc='The number of clusters to create. Must be &gt; 1.')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.k" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.KMeans.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.featuresCol" title="pyspark.ml.clustering.KMeans.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setInitMode">
<code class="descname">setInitMode</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans.setInitMode"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setInitMode" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.initMode" title="pyspark.ml.clustering.KMeans.initMode"><code class="xref py py-attr docutils literal"><span class="pre">initMode</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setInitSteps">
<code class="descname">setInitSteps</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans.setInitSteps"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setInitSteps" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.initSteps" title="pyspark.ml.clustering.KMeans.initSteps"><code class="xref py py-attr docutils literal"><span class="pre">initSteps</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setK">
<code class="descname">setK</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans.setK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setK" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.k" title="pyspark.ml.clustering.KMeans.k"><code class="xref py py-attr docutils literal"><span class="pre">k</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.maxIter" title="pyspark.ml.clustering.KMeans.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>k=2</em>, <em>initMode=&quot;k-means||&quot;</em>, <em>initSteps=2</em>, <em>tol=1e-4</em>, <em>maxIter=20</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeans.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for KMeans.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.predictionCol" title="pyspark.ml.clustering.KMeans.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.seed" title="pyspark.ml.clustering.KMeans.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.KMeans.tol" title="pyspark.ml.clustering.KMeans.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeans.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.clustering.KMeans.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeans.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeans.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.KMeansModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">KMeansModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeansModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by KMeans.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.clusterCenters">
<code class="descname">clusterCenters</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeansModel.clusterCenters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.clusterCenters" title="Permalink to this definition"></a></dt>
<dd><p>Get the cluster centers, represented as a list of NumPy arrays.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.computeCost">
<code class="descname">computeCost</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#KMeansModel.computeCost"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.computeCost" title="Permalink to this definition"></a></dt>
<dd><p>Return the K-means cost (sum of squared distances of points to their nearest center)
for this model on the given data.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeansModel.hasSummary">
<code class="descname">hasSummary</code><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.hasSummary" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether a training summary exists for this model instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeansModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.KMeansModel.summary">
<code class="descname">summary</code><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.summary" title="Permalink to this definition"></a></dt>
<dd><p>Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the
training set. An exception is thrown if no summary exists.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.KMeansModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.KMeansModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.GaussianMixture">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">GaussianMixture</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>predictionCol='prediction'</em>, <em>k=2</em>, <em>probabilityCol='probability'</em>, <em>tol=0.01</em>, <em>maxIter=100</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#GaussianMixture"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture" title="Permalink to this definition"></a></dt>
<dd><p>GaussianMixture clustering.
This class performs expectation maximization for multivariate Gaussian
Mixture Models (GMMs). A GMM represents a composite distribution of
independent Gaussian distributions with associated “mixing” weights
specifying each’s contribution to the composite.</p>
<p>Given a set of sample points, this class will maximize the log-likelihood
for a mixture of k Gaussians, iterating until the log-likelihood changes by
less than the convergence tolerance (<cite>tol</cite>), or until it has reached the maximum number of iterations.
While this process is generally guaranteed to converge, it is not guaranteed
to find a global optimum.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">For high-dimensional data (with many features), this algorithm may perform poorly.
This is due to high-dimensional data (a) making it difficult to cluster at all
(based on statistical/theoretical arguments) and (b) numerical issues with
Gaussian distributions.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">0.1</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.05</span> <span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">0.01</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.1</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.9</span><span class="p">,</span> <span class="mf">0.8</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.75</span><span class="p">,</span> <span class="mf">0.935</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">0.83</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.68</span><span class="p">]),),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">0.91</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.76</span><span class="p">]),)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gm</span> <span class="o">=</span> <span class="n">GaussianMixture</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">tol</span><span class="o">=</span><span class="mf">0.0001</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">gm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">hasSummary</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">summary</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span><span class="o">.</span><span class="n">k</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span><span class="o">.</span><span class="n">clusterSizes</span>
<span class="go">[2, 2, 2]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span><span class="o">.</span><span class="n">logLikelihood</span>
<span class="go">8.14636...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">weights</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">weights</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">weights</span><span class="p">)</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">gaussiansDF</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;mean&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(mean=DenseVector([0.825, 0.8675]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">gaussiansDF</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;cov&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">transformed</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="s2">&quot;prediction&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span> <span class="o">=</span> <span class="n">transformed</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="p">[</span><span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span> <span class="o">==</span> <span class="n">rows</span><span class="p">[</span><span class="mi">5</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rows</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span> <span class="o">==</span> <span class="n">rows</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gmm_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/gmm&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">gmm_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gm2</span> <span class="o">=</span> <span class="n">GaussianMixture</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">gmm_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gm2</span><span class="o">.</span><span class="n">getK</span><span class="p">()</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/gmm_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">GaussianMixtureModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">hasSummary</span>
<span class="go">False</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">weights</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">weights</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">gaussiansDF</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;mean&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(mean=DenseVector([0.825, 0.8675]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span><span class="o">.</span><span class="n">gaussiansDF</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;cov&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="go">Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getK">
<code class="descname">getK</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#GaussianMixture.getK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getK" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <cite>k</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getProbabilityCol">
<code class="descname">getProbabilityCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of probabilityCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.k">
<code class="descname">k</code><em class="property"> = Param(parent='undefined', name='k', doc='Number of independent Gaussians in the mixture model. Must be &gt; 1.')</em><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.k" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.probabilityCol">
<code class="descname">probabilityCol</code><em class="property"> = Param(parent='undefined', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.')</em><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.probabilityCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.GaussianMixture.featuresCol" title="pyspark.ml.clustering.GaussianMixture.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setK">
<code class="descname">setK</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#GaussianMixture.setK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setK" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.GaussianMixture.k" title="pyspark.ml.clustering.GaussianMixture.k"><code class="xref py py-attr docutils literal"><span class="pre">k</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.GaussianMixture.maxIter" title="pyspark.ml.clustering.GaussianMixture.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>k=2</em>, <em>probabilityCol=&quot;probability&quot;</em>, <em>tol=0.01</em>, <em>maxIter=100</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#GaussianMixture.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for GaussianMixture.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.GaussianMixture.predictionCol" title="pyspark.ml.clustering.GaussianMixture.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setProbabilityCol">
<code class="descname">setProbabilityCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setProbabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.GaussianMixture.probabilityCol" title="pyspark.ml.clustering.GaussianMixture.probabilityCol"><code class="xref py py-attr docutils literal"><span class="pre">probabilityCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.GaussianMixture.seed" title="pyspark.ml.clustering.GaussianMixture.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.GaussianMixture.tol" title="pyspark.ml.clustering.GaussianMixture.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixture.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixture.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixture.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.GaussianMixtureModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">GaussianMixtureModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#GaussianMixtureModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by GaussianMixture.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.gaussiansDF">
<code class="descname">gaussiansDF</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.gaussiansDF" title="Permalink to this definition"></a></dt>
<dd><p>Retrieve Gaussian distributions as a DataFrame.
Each row represents a Gaussian Distribution.
The DataFrame has two columns: mean (Vector) and cov (Matrix).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.hasSummary">
<code class="descname">hasSummary</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.hasSummary" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether a training summary exists for this model
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.summary">
<code class="descname">summary</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.summary" title="Permalink to this definition"></a></dt>
<dd><p>Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the
training set. An exception is thrown if no summary exists.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.weights">
<code class="descname">weights</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.weights" title="Permalink to this definition"></a></dt>
<dd><p>Weight for each Gaussian distribution in the mixture.
This is a multinomial probability distribution over the k Gaussians,
where weights[i] is the weight for Gaussian i, and weights sum to 1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.GaussianMixtureModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">GaussianMixtureSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#GaussianMixtureSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Gaussian mixture clustering results for a given model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.cluster">
<code class="descname">cluster</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.cluster" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame of predicted cluster centers for each training data point.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.clusterSizes">
<code class="descname">clusterSizes</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.clusterSizes" title="Permalink to this definition"></a></dt>
<dd><p>Size of (number of data points in) each cluster.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Name for column of features in <cite>predictions</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.k">
<code class="descname">k</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.k" title="Permalink to this definition"></a></dt>
<dd><p>The number of clusters the model was trained with.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.logLikelihood">
<code class="descname">logLikelihood</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.logLikelihood" title="Permalink to this definition"></a></dt>
<dd><p>Total log-likelihood for this model on the given data.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.predictionCol">
<code class="descname">predictionCol</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.predictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Name for column of predicted clusters in <cite>predictions</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame produced by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.probability">
<code class="descname">probability</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.probability" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame of probabilities of each cluster for each training data point.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.GaussianMixtureSummary.probabilityCol">
<code class="descname">probabilityCol</code><a class="headerlink" href="#pyspark.ml.clustering.GaussianMixtureSummary.probabilityCol" title="Permalink to this definition"></a></dt>
<dd><p>Name for column of predicted probability of each cluster in <cite>predictions</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.LDA">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">LDA</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>maxIter=20</em>, <em>seed=None</em>, <em>checkpointInterval=10</em>, <em>k=10</em>, <em>optimizer='online'</em>, <em>learningOffset=1024.0</em>, <em>learningDecay=0.51</em>, <em>subsamplingRate=0.05</em>, <em>optimizeDocConcentration=True</em>, <em>docConcentration=None</em>, <em>topicConcentration=None</em>, <em>topicDistributionCol='topicDistribution'</em>, <em>keepLastCheckpoint=True</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA" title="Permalink to this definition"></a></dt>
<dd><p>Latent Dirichlet Allocation (LDA), a topic model designed for text documents.</p>
<p>Terminology:</p>
<blockquote>
<div><ul class="simple">
<li>“term” = “word”: an element of the vocabulary</li>
<li>“token”: instance of a term appearing in a document</li>
<li>“topic”: multinomial distribution over terms representing some concept</li>
<li>“document”: one piece of text, corresponding to one row in the input data</li>
</ul>
</div></blockquote>
<dl class="docutils">
<dt>Original LDA paper (journal version):</dt>
<dd>Blei, Ng, and Jordan. “Latent Dirichlet Allocation.” JMLR, 2003.</dd>
</dl>
<p>Input data (featuresCol):
LDA is given a collection of documents as input data, via the featuresCol parameter.
Each document is specified as a <code class="xref py py-class docutils literal"><span class="pre">Vector</span></code> of length vocabSize, where each entry is the
count for the corresponding term (word) in the document. Feature transformers such as
<a class="reference internal" href="#pyspark.ml.feature.Tokenizer" title="pyspark.ml.feature.Tokenizer"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.feature.Tokenizer</span></code></a> and <a class="reference internal" href="#pyspark.ml.feature.CountVectorizer" title="pyspark.ml.feature.CountVectorizer"><code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.feature.CountVectorizer</span></code></a>
can be useful for converting text to word count vectors.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span><span class="p">,</span> <span class="n">SparseVector</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.clustering</span> <span class="k">import</span> <span class="n">LDA</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">})],],</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lda</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">optimizer</span><span class="o">=</span><span class="s2">&quot;em&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">lda</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">isDistributed</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">localModel</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">toLocal</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">localModel</span><span class="o">.</span><span class="n">isDistributed</span><span class="p">()</span>
<span class="go">False</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">vocabSize</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">describeTopics</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+-----+-----------+--------------------+</span>
<span class="go">|topic|termIndices| termWeights|</span>
<span class="go">+-----+-----------+--------------------+</span>
<span class="go">| 0| [1, 0]|[0.50401530077160...|</span>
<span class="go">| 1| [0, 1]|[0.50401530077160...|</span>
<span class="go">+-----+-----------+--------------------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">topicsMatrix</span><span class="p">()</span>
<span class="go">DenseMatrix(2, 2, [0.496, 0.504, 0.504, 0.496], 0)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lda_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/lda&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lda</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">lda_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameLDA</span> <span class="o">=</span> <span class="n">LDA</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">lda_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">distributed_model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/lda_distributed_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">distributed_model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameModel</span> <span class="o">=</span> <span class="n">DistributedLDAModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">distributed_model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">local_model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/lda_local_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">localModel</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">local_model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sameLocalModel</span> <span class="o">=</span> <span class="n">LocalLDAModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">local_model_path</span><span class="p">)</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.docConcentration">
<code class="descname">docConcentration</code><em class="property"> = Param(parent='undefined', name='docConcentration', doc='Concentration parameter (commonly named &quot;alpha&quot;) for the prior placed on documents\' distributions over topics (&quot;theta&quot;).')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.docConcentration" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getDocConcentration">
<code class="descname">getDocConcentration</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getDocConcentration"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getDocConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">docConcentration</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getK">
<code class="descname">getK</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getK" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.k" title="pyspark.ml.clustering.LDA.k"><code class="xref py py-attr docutils literal"><span class="pre">k</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getKeepLastCheckpoint">
<code class="descname">getKeepLastCheckpoint</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getKeepLastCheckpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getKeepLastCheckpoint" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.keepLastCheckpoint" title="pyspark.ml.clustering.LDA.keepLastCheckpoint"><code class="xref py py-attr docutils literal"><span class="pre">keepLastCheckpoint</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getLearningDecay">
<code class="descname">getLearningDecay</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getLearningDecay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getLearningDecay" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.learningDecay" title="pyspark.ml.clustering.LDA.learningDecay"><code class="xref py py-attr docutils literal"><span class="pre">learningDecay</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getLearningOffset">
<code class="descname">getLearningOffset</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getLearningOffset"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getLearningOffset" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.learningOffset" title="pyspark.ml.clustering.LDA.learningOffset"><code class="xref py py-attr docutils literal"><span class="pre">learningOffset</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getOptimizeDocConcentration">
<code class="descname">getOptimizeDocConcentration</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getOptimizeDocConcentration"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getOptimizeDocConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.optimizeDocConcentration" title="pyspark.ml.clustering.LDA.optimizeDocConcentration"><code class="xref py py-attr docutils literal"><span class="pre">optimizeDocConcentration</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getOptimizer">
<code class="descname">getOptimizer</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getOptimizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getOptimizer" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.optimizer" title="pyspark.ml.clustering.LDA.optimizer"><code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getSubsamplingRate">
<code class="descname">getSubsamplingRate</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getSubsamplingRate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.subsamplingRate" title="pyspark.ml.clustering.LDA.subsamplingRate"><code class="xref py py-attr docutils literal"><span class="pre">subsamplingRate</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getTopicConcentration">
<code class="descname">getTopicConcentration</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getTopicConcentration"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getTopicConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.topicConcentration" title="pyspark.ml.clustering.LDA.topicConcentration"><code class="xref py py-attr docutils literal"><span class="pre">topicConcentration</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.getTopicDistributionCol">
<code class="descname">getTopicDistributionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.getTopicDistributionCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.getTopicDistributionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.topicDistributionCol" title="pyspark.ml.clustering.LDA.topicDistributionCol"><code class="xref py py-attr docutils literal"><span class="pre">topicDistributionCol</span></code></a> or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.k">
<code class="descname">k</code><em class="property"> = Param(parent='undefined', name='k', doc='The number of topics (clusters) to infer. Must be &gt; 1.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.k" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.keepLastCheckpoint">
<code class="descname">keepLastCheckpoint</code><em class="property"> = Param(parent='undefined', name='keepLastCheckpoint', doc='(For EM optimizer) If using checkpointing, this indicates whether to keep the last checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can cause failures if a data partition is lost, so set this bit with care.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.keepLastCheckpoint" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.learningDecay">
<code class="descname">learningDecay</code><em class="property"> = Param(parent='undefined', name='learningDecay', doc='Learning rate, set as an exponential decay rate. This should be between (0.5, 1.0] to guarantee asymptotic convergence.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.learningDecay" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.learningOffset">
<code class="descname">learningOffset</code><em class="property"> = Param(parent='undefined', name='learningOffset', doc='A (positive) learning parameter that downweights early iterations. Larger values make early iterations count less')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.learningOffset" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.optimizeDocConcentration">
<code class="descname">optimizeDocConcentration</code><em class="property"> = Param(parent='undefined', name='optimizeDocConcentration', doc='Indicates whether the docConcentration (Dirichlet parameter for document-topic distribution) will be optimized during training.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.optimizeDocConcentration" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.optimizer">
<code class="descname">optimizer</code><em class="property"> = Param(parent='undefined', name='optimizer', doc='Optimizer or inference algorithm used to estimate the LDA model. Supported: online, em')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.optimizer" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.LDA.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.checkpointInterval" title="pyspark.ml.clustering.LDA.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setDocConcentration">
<code class="descname">setDocConcentration</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setDocConcentration"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setDocConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">docConcentration</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setDocConcentration</span><span class="p">([</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getDocConcentration</span><span class="p">()</span>
<span class="go">[0.1..., 0.2...]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.featuresCol" title="pyspark.ml.clustering.LDA.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setK">
<code class="descname">setK</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setK"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setK" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.k" title="pyspark.ml.clustering.LDA.k"><code class="xref py py-attr docutils literal"><span class="pre">k</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setK</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getK</span><span class="p">()</span>
<span class="go">10</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setKeepLastCheckpoint">
<code class="descname">setKeepLastCheckpoint</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setKeepLastCheckpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setKeepLastCheckpoint" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.keepLastCheckpoint" title="pyspark.ml.clustering.LDA.keepLastCheckpoint"><code class="xref py py-attr docutils literal"><span class="pre">keepLastCheckpoint</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setKeepLastCheckpoint</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getKeepLastCheckpoint</span><span class="p">()</span>
<span class="go">False</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setLearningDecay">
<code class="descname">setLearningDecay</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setLearningDecay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setLearningDecay" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.learningDecay" title="pyspark.ml.clustering.LDA.learningDecay"><code class="xref py py-attr docutils literal"><span class="pre">learningDecay</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setLearningDecay</span><span class="p">(</span><span class="mf">0.1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getLearningDecay</span><span class="p">()</span>
<span class="go">0.1...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setLearningOffset">
<code class="descname">setLearningOffset</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setLearningOffset"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setLearningOffset" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.learningOffset" title="pyspark.ml.clustering.LDA.learningOffset"><code class="xref py py-attr docutils literal"><span class="pre">learningOffset</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setLearningOffset</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getLearningOffset</span><span class="p">()</span>
<span class="go">100.0</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.maxIter" title="pyspark.ml.clustering.LDA.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setOptimizeDocConcentration">
<code class="descname">setOptimizeDocConcentration</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setOptimizeDocConcentration"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setOptimizeDocConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.optimizeDocConcentration" title="pyspark.ml.clustering.LDA.optimizeDocConcentration"><code class="xref py py-attr docutils literal"><span class="pre">optimizeDocConcentration</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setOptimizeDocConcentration</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getOptimizeDocConcentration</span><span class="p">()</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setOptimizer">
<code class="descname">setOptimizer</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setOptimizer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setOptimizer" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.optimizer" title="pyspark.ml.clustering.LDA.optimizer"><code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code></a>.
Currently only supports ‘em’ and ‘online’.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setOptimizer</span><span class="p">(</span><span class="s2">&quot;em&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getOptimizer</span><span class="p">()</span>
<span class="go">&#39;em&#39;</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>maxIter=20</em>, <em>seed=None</em>, <em>checkpointInterval=10</em>, <em>k=10</em>, <em>optimizer='online'</em>, <em>learningOffset=1024.0</em>, <em>learningDecay=0.51</em>, <em>subsamplingRate=0.05</em>, <em>optimizeDocConcentration=True</em>, <em>docConcentration=None</em>, <em>topicConcentration=None</em>, <em>topicDistributionCol='topicDistribution'</em>, <em>keepLastCheckpoint=True</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, featuresCol=&quot;features&quot;, maxIter=20, seed=None, checkpointInterval=10, k=10, optimizer=&quot;online&quot;, learningOffset=1024.0, learningDecay=0.51, subsamplingRate=0.05, optimizeDocConcentration=True, docConcentration=None, topicConcentration=None, topicDistributionCol=&quot;topicDistribution&quot;, keepLastCheckpoint=True):</p>
<p>Sets params for LDA.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.seed" title="pyspark.ml.clustering.LDA.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setSubsamplingRate">
<code class="descname">setSubsamplingRate</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setSubsamplingRate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.subsamplingRate" title="pyspark.ml.clustering.LDA.subsamplingRate"><code class="xref py py-attr docutils literal"><span class="pre">subsamplingRate</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setSubsamplingRate</span><span class="p">(</span><span class="mf">0.1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getSubsamplingRate</span><span class="p">()</span>
<span class="go">0.1...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setTopicConcentration">
<code class="descname">setTopicConcentration</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setTopicConcentration"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setTopicConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.topicConcentration" title="pyspark.ml.clustering.LDA.topicConcentration"><code class="xref py py-attr docutils literal"><span class="pre">topicConcentration</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setTopicConcentration</span><span class="p">(</span><span class="mf">0.5</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getTopicConcentration</span><span class="p">()</span>
<span class="go">0.5...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.setTopicDistributionCol">
<code class="descname">setTopicDistributionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDA.setTopicDistributionCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDA.setTopicDistributionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.clustering.LDA.topicDistributionCol" title="pyspark.ml.clustering.LDA.topicDistributionCol"><code class="xref py py-attr docutils literal"><span class="pre">topicDistributionCol</span></code></a>.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span> <span class="o">=</span> <span class="n">LDA</span><span class="p">()</span><span class="o">.</span><span class="n">setTopicDistributionCol</span><span class="p">(</span><span class="s2">&quot;topicDistributionCol&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">algo</span><span class="o">.</span><span class="n">getTopicDistributionCol</span><span class="p">()</span>
<span class="go">&#39;topicDistributionCol&#39;</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.subsamplingRate">
<code class="descname">subsamplingRate</code><em class="property"> = Param(parent='undefined', name='subsamplingRate', doc='Fraction of the corpus to be sampled and used in each iteration of mini-batch gradient descent, in range (0, 1].')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.subsamplingRate" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.topicConcentration">
<code class="descname">topicConcentration</code><em class="property"> = Param(parent='undefined', name='topicConcentration', doc='Concentration parameter (commonly named &quot;beta&quot; or &quot;eta&quot;) for the prior placed on topics\' distributions over terms.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.topicConcentration" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDA.topicDistributionCol">
<code class="descname">topicDistributionCol</code><em class="property"> = Param(parent='undefined', name='topicDistributionCol', doc='Output column with estimates of the topic mixture distribution for each document (often called &quot;theta&quot; in the literature). Returns a vector of zeros for an empty document.')</em><a class="headerlink" href="#pyspark.ml.clustering.LDA.topicDistributionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDA.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDA.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.LDAModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">LDAModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel" title="Permalink to this definition"></a></dt>
<dd><p>Latent Dirichlet Allocation (LDA) model.
This abstraction permits different underlying representations,
including local and distributed data structures.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.describeTopics">
<code class="descname">describeTopics</code><span class="sig-paren">(</span><em>maxTermsPerTopic=10</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel.describeTopics"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.describeTopics" title="Permalink to this definition"></a></dt>
<dd><p>Return the topics described by their top-weighted terms.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.estimatedDocConcentration">
<code class="descname">estimatedDocConcentration</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel.estimatedDocConcentration"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.estimatedDocConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Value for <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.docConcentration</span></code></a> estimated from data.
If Online LDA was used and <a class="reference internal" href="#pyspark.ml.clustering.LDA.optimizeDocConcentration" title="pyspark.ml.clustering.LDA.optimizeDocConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.optimizeDocConcentration</span></code></a> was set to false,
then this returns the fixed (given) value for the <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.docConcentration</span></code></a> parameter.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.isDistributed">
<code class="descname">isDistributed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel.isDistributed"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.isDistributed" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether this instance is of type DistributedLDAModel</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.logLikelihood">
<code class="descname">logLikelihood</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel.logLikelihood"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.logLikelihood" title="Permalink to this definition"></a></dt>
<dd><p>Calculates a lower bound on the log likelihood of the entire corpus.
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</p>
<p>WARNING: If this model is an instance of <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> (produced when
<code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code> is set to “em”), this involves collecting a large
<a class="reference internal" href="#pyspark.ml.clustering.LDAModel.topicsMatrix" title="pyspark.ml.clustering.LDAModel.topicsMatrix"><code class="xref py py-func docutils literal"><span class="pre">topicsMatrix()</span></code></a> to the driver. This implementation may be changed in the future.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.logPerplexity">
<code class="descname">logPerplexity</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel.logPerplexity"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.logPerplexity" title="Permalink to this definition"></a></dt>
<dd><p>Calculate an upper bound on perplexity. (Lower is better.)
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</p>
<p>WARNING: If this model is an instance of <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> (produced when
<code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code> is set to “em”), this involves collecting a large
<a class="reference internal" href="#pyspark.ml.clustering.LDAModel.topicsMatrix" title="pyspark.ml.clustering.LDAModel.topicsMatrix"><code class="xref py py-func docutils literal"><span class="pre">topicsMatrix()</span></code></a> to the driver. This implementation may be changed in the future.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LDAModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.topicsMatrix">
<code class="descname">topicsMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel.topicsMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.topicsMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Inferred topics, where each topic is represented by a distribution over terms.
This is a matrix of size vocabSize x k, where each column is a topic.
No guarantees are given about the ordering of the topics.</p>
<p>WARNING: If this model is actually a <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> instance produced by
the Expectation-Maximization (“em”) <cite>optimizer</cite>, then this method could involve
collecting a large amount of data to the driver (on the order of vocabSize x k).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LDAModel.vocabSize">
<code class="descname">vocabSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LDAModel.vocabSize"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LDAModel.vocabSize" title="Permalink to this definition"></a></dt>
<dd><p>Vocabulary size (number of terms or words in the vocabulary)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.LocalLDAModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">LocalLDAModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#LocalLDAModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel" title="Permalink to this definition"></a></dt>
<dd><p>Local (non-distributed) model fitted by <a class="reference internal" href="#pyspark.ml.clustering.LDA" title="pyspark.ml.clustering.LDA"><code class="xref py py-class docutils literal"><span class="pre">LDA</span></code></a>.
This model stores the inferred topics only; it does not store info about the training dataset.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.describeTopics">
<code class="descname">describeTopics</code><span class="sig-paren">(</span><em>maxTermsPerTopic=10</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.describeTopics" title="Permalink to this definition"></a></dt>
<dd><p>Return the topics described by their top-weighted terms.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.estimatedDocConcentration">
<code class="descname">estimatedDocConcentration</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.estimatedDocConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Value for <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.docConcentration</span></code></a> estimated from data.
If Online LDA was used and <a class="reference internal" href="#pyspark.ml.clustering.LDA.optimizeDocConcentration" title="pyspark.ml.clustering.LDA.optimizeDocConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.optimizeDocConcentration</span></code></a> was set to false,
then this returns the fixed (given) value for the <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.docConcentration</span></code></a> parameter.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.isDistributed">
<code class="descname">isDistributed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.isDistributed" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether this instance is of type DistributedLDAModel</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.logLikelihood">
<code class="descname">logLikelihood</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.logLikelihood" title="Permalink to this definition"></a></dt>
<dd><p>Calculates a lower bound on the log likelihood of the entire corpus.
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</p>
<p>WARNING: If this model is an instance of <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> (produced when
<code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code> is set to “em”), this involves collecting a large
<a class="reference internal" href="#pyspark.ml.clustering.LocalLDAModel.topicsMatrix" title="pyspark.ml.clustering.LocalLDAModel.topicsMatrix"><code class="xref py py-func docutils literal"><span class="pre">topicsMatrix()</span></code></a> to the driver. This implementation may be changed in the future.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.logPerplexity">
<code class="descname">logPerplexity</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.logPerplexity" title="Permalink to this definition"></a></dt>
<dd><p>Calculate an upper bound on perplexity. (Lower is better.)
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</p>
<p>WARNING: If this model is an instance of <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> (produced when
<code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code> is set to “em”), this involves collecting a large
<a class="reference internal" href="#pyspark.ml.clustering.LocalLDAModel.topicsMatrix" title="pyspark.ml.clustering.LocalLDAModel.topicsMatrix"><code class="xref py py-func docutils literal"><span class="pre">topicsMatrix()</span></code></a> to the driver. This implementation may be changed in the future.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.LocalLDAModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.topicsMatrix">
<code class="descname">topicsMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.topicsMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Inferred topics, where each topic is represented by a distribution over terms.
This is a matrix of size vocabSize x k, where each column is a topic.
No guarantees are given about the ordering of the topics.</p>
<p>WARNING: If this model is actually a <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> instance produced by
the Expectation-Maximization (“em”) <cite>optimizer</cite>, then this method could involve
collecting a large amount of data to the driver (on the order of vocabSize x k).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.vocabSize">
<code class="descname">vocabSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.vocabSize" title="Permalink to this definition"></a></dt>
<dd><p>Vocabulary size (number of terms or words in the vocabulary)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.LocalLDAModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.LocalLDAModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.clustering.DistributedLDAModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.clustering.</code><code class="descname">DistributedLDAModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#DistributedLDAModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel" title="Permalink to this definition"></a></dt>
<dd><p>Distributed model fitted by <a class="reference internal" href="#pyspark.ml.clustering.LDA" title="pyspark.ml.clustering.LDA"><code class="xref py py-class docutils literal"><span class="pre">LDA</span></code></a>.
This type of model is currently only produced by Expectation-Maximization (EM).</p>
<p>This model stores the inferred topics, the full training dataset, and the topic distribution
for each training document.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.describeTopics">
<code class="descname">describeTopics</code><span class="sig-paren">(</span><em>maxTermsPerTopic=10</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.describeTopics" title="Permalink to this definition"></a></dt>
<dd><p>Return the topics described by their top-weighted terms.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.estimatedDocConcentration">
<code class="descname">estimatedDocConcentration</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.estimatedDocConcentration" title="Permalink to this definition"></a></dt>
<dd><p>Value for <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.docConcentration</span></code></a> estimated from data.
If Online LDA was used and <a class="reference internal" href="#pyspark.ml.clustering.LDA.optimizeDocConcentration" title="pyspark.ml.clustering.LDA.optimizeDocConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.optimizeDocConcentration</span></code></a> was set to false,
then this returns the fixed (given) value for the <a class="reference internal" href="#pyspark.ml.clustering.LDA.docConcentration" title="pyspark.ml.clustering.LDA.docConcentration"><code class="xref py py-attr docutils literal"><span class="pre">LDA.docConcentration</span></code></a> parameter.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.getCheckpointFiles">
<code class="descname">getCheckpointFiles</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#DistributedLDAModel.getCheckpointFiles"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.getCheckpointFiles" title="Permalink to this definition"></a></dt>
<dd><p>If using checkpointing and <a class="reference internal" href="#pyspark.ml.clustering.LDA.keepLastCheckpoint" title="pyspark.ml.clustering.LDA.keepLastCheckpoint"><code class="xref py py-attr docutils literal"><span class="pre">LDA.keepLastCheckpoint</span></code></a> is set to true, then there may
be saved checkpoint files. This method is provided so that users can manage those files.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Removing the checkpoints can cause failures if a partition is lost and is needed
by certain <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> methods. Reference counting will clean up
the checkpoints when this model and derivative data go out of scope.</p>
</div>
<p>Returns a list of checkpoint files from training.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.isDistributed">
<code class="descname">isDistributed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.isDistributed" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether this instance is of type DistributedLDAModel</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.logLikelihood">
<code class="descname">logLikelihood</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.logLikelihood" title="Permalink to this definition"></a></dt>
<dd><p>Calculates a lower bound on the log likelihood of the entire corpus.
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</p>
<p>WARNING: If this model is an instance of <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> (produced when
<code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code> is set to “em”), this involves collecting a large
<a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel.topicsMatrix" title="pyspark.ml.clustering.DistributedLDAModel.topicsMatrix"><code class="xref py py-func docutils literal"><span class="pre">topicsMatrix()</span></code></a> to the driver. This implementation may be changed in the future.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.logPerplexity">
<code class="descname">logPerplexity</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.logPerplexity" title="Permalink to this definition"></a></dt>
<dd><p>Calculate an upper bound on perplexity. (Lower is better.)
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).</p>
<p>WARNING: If this model is an instance of <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> (produced when
<code class="xref py py-attr docutils literal"><span class="pre">optimizer</span></code> is set to “em”), this involves collecting a large
<a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel.topicsMatrix" title="pyspark.ml.clustering.DistributedLDAModel.topicsMatrix"><code class="xref py py-func docutils literal"><span class="pre">topicsMatrix()</span></code></a> to the driver. This implementation may be changed in the future.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.logPrior">
<code class="descname">logPrior</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#DistributedLDAModel.logPrior"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.logPrior" title="Permalink to this definition"></a></dt>
<dd><p>Log probability of the current parameter estimate:
log P(topics, topic distributions for docs | alpha, eta)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.clustering.DistributedLDAModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.toLocal">
<code class="descname">toLocal</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#DistributedLDAModel.toLocal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.toLocal" title="Permalink to this definition"></a></dt>
<dd><p>Convert this distributed model to a local representation. This discards info about the
training dataset.</p>
<p>WARNING: This involves collecting a large <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel.topicsMatrix" title="pyspark.ml.clustering.DistributedLDAModel.topicsMatrix"><code class="xref py py-func docutils literal"><span class="pre">topicsMatrix()</span></code></a> to the driver.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.topicsMatrix">
<code class="descname">topicsMatrix</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.topicsMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Inferred topics, where each topic is represented by a distribution over terms.
This is a matrix of size vocabSize x k, where each column is a topic.
No guarantees are given about the ordering of the topics.</p>
<p>WARNING: If this model is actually a <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel" title="pyspark.ml.clustering.DistributedLDAModel"><code class="xref py py-class docutils literal"><span class="pre">DistributedLDAModel</span></code></a> instance produced by
the Expectation-Maximization (“em”) <cite>optimizer</cite>, then this method could involve
collecting a large amount of data to the driver (on the order of vocabSize x k).</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.trainingLogLikelihood">
<code class="descname">trainingLogLikelihood</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/clustering.html#DistributedLDAModel.trainingLogLikelihood"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.trainingLogLikelihood" title="Permalink to this definition"></a></dt>
<dd><p>Log likelihood of the observed tokens in the training set,
given the current parameter estimates:
log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)</p>
<dl class="docutils">
<dt>Notes:</dt>
<dd><ul class="first last simple">
<li>This excludes the prior; for that, use <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel.logPrior" title="pyspark.ml.clustering.DistributedLDAModel.logPrior"><code class="xref py py-func docutils literal"><span class="pre">logPrior()</span></code></a>.</li>
<li>Even with <a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel.logPrior" title="pyspark.ml.clustering.DistributedLDAModel.logPrior"><code class="xref py py-func docutils literal"><span class="pre">logPrior()</span></code></a>, this is NOT the same as the data log likelihood given
the hyperparameters.</li>
<li>This is computed from the topic distributions computed during training. If you call
<a class="reference internal" href="#pyspark.ml.clustering.DistributedLDAModel.logLikelihood" title="pyspark.ml.clustering.DistributedLDAModel.logLikelihood"><code class="xref py py-func docutils literal"><span class="pre">logLikelihood()</span></code></a> on the same training dataset, the topic distributions
will be computed again, possibly giving different results.</li>
</ul>
</dd>
</dl>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.vocabSize">
<code class="descname">vocabSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.vocabSize" title="Permalink to this definition"></a></dt>
<dd><p>Vocabulary size (number of terms or words in the vocabulary)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.clustering.DistributedLDAModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.clustering.DistributedLDAModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.linalg">
<span id="pyspark-ml-linalg-module"></span><h2>pyspark.ml.linalg module<a class="headerlink" href="#module-pyspark.ml.linalg" title="Permalink to this headline"></a></h2>
<p>MLlib utilities for linear algebra. For dense vectors, MLlib
uses the NumPy <code class="xref py py-class docutils literal"><span class="pre">array</span></code> type, so you can simply pass NumPy arrays
around. For sparse vectors, users can construct a <a class="reference internal" href="#pyspark.ml.linalg.SparseVector" title="pyspark.ml.linalg.SparseVector"><code class="xref py py-class docutils literal"><span class="pre">SparseVector</span></code></a>
object from MLlib or pass SciPy <code class="xref py py-class docutils literal"><span class="pre">scipy.sparse</span></code> column vectors if
SciPy is available in their environment.</p>
<dl class="class">
<dt id="pyspark.ml.linalg.Vector">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">Vector</code><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vector" title="Permalink to this definition"></a></dt>
<dd><dl class="method">
<dt id="pyspark.ml.linalg.Vector.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vector.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vector.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Convert the vector into a numpy.ndarray</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">numpy.ndarray</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.linalg.DenseVector">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">DenseVector</code><span class="sig-paren">(</span><em>ar</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseVector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseVector" title="Permalink to this definition"></a></dt>
<dd><p>A dense vector represented by a value array. We use a NumPy array for
storage, and arithmetic is delegated to the underlying NumPy
array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">u</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">+</span> <span class="n">u</span>
<span class="go">DenseVector([4.0, 6.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="mi">2</span> <span class="o">-</span> <span class="n">v</span>
<span class="go">DenseVector([1.0, 0.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">/</span> <span class="mi">2</span>
<span class="go">DenseVector([0.5, 1.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">v</span> <span class="o">*</span> <span class="n">u</span>
<span class="go">DenseVector([3.0, 8.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">u</span> <span class="o">/</span> <span class="n">v</span>
<span class="go">DenseVector([3.0, 2.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">u</span> <span class="o">%</span> <span class="mi">2</span>
<span class="go">DenseVector([1.0, 0.0])</span>
</pre></div>
</div>
<dl class="method">
<dt id="pyspark.ml.linalg.DenseVector.dot">
<code class="descname">dot</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseVector.dot"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseVector.dot" title="Permalink to this definition"></a></dt>
<dd><p>Compute the dot product of two Vectors. The other operand may be
a NumPy array (either 1- or 2-dimensional), a list, a SparseVector,
or a SciPy sparse vector.
Equivalent to calling numpy.dot on the two vectors.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span> <span class="o">=</span> <span class="n">DenseVector</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">dense</span><span class="p">)</span>
<span class="go">5.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">]))</span>
<span class="go">4.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">))</span>
<span class="go">5.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">)))</span>
<span class="go">5.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">([</span><span class="mf">1.</span><span class="p">,])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">reshape</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">],</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span> <span class="n">order</span><span class="o">=</span><span class="s1">&#39;F&#39;</span><span class="p">))</span>
<span class="go">array([ 5., 11.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">reshape</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">],</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">order</span><span class="o">=</span><span class="s1">&#39;F&#39;</span><span class="p">))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.DenseVector.norm">
<code class="descname">norm</code><span class="sig-paren">(</span><em>p</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseVector.norm"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseVector.norm" title="Permalink to this definition"></a></dt>
<dd><p>Calculates the norm of a DenseVector.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">DenseVector</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="o">-</span><span class="mi">3</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="go">3.7...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="go">6.0</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.DenseVector.numNonzeros">
<code class="descname">numNonzeros</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseVector.numNonzeros"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseVector.numNonzeros" title="Permalink to this definition"></a></dt>
<dd><p>Number of nonzero elements. This scans all active values and counts the nonzeros.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.DenseVector.squared_distance">
<code class="descname">squared_distance</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseVector.squared_distance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseVector.squared_distance" title="Permalink to this definition"></a></dt>
<dd><p>Squared distance of two Vectors.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span> <span class="o">=</span> <span class="n">DenseVector</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">dense1</span><span class="p">)</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense2</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">dense2</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense3</span> <span class="o">=</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">dense3</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">sparse1</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">1.</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">sparse1</span><span class="p">)</span>
<span class="go">2.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">([</span><span class="mf">1.</span><span class="p">,])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dense1</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,],</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.DenseVector.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseVector.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseVector.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Returns a numpy.ndarray</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.linalg.DenseVector.values">
<code class="descname">values</code><a class="headerlink" href="#pyspark.ml.linalg.DenseVector.values" title="Permalink to this definition"></a></dt>
<dd><p>Returns a list of values</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.linalg.SparseVector">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">SparseVector</code><span class="sig-paren">(</span><em>size</em>, <em>*args</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseVector"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseVector" title="Permalink to this definition"></a></dt>
<dd><p>A simple sparse vector class for passing data to MLlib. Users may
alternatively pass SciPy’s <code class="xref py py-class docutils literal"><span class="pre">scipy.sparse</span></code> data types.</p>
<dl class="method">
<dt id="pyspark.ml.linalg.SparseVector.dot">
<code class="descname">dot</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseVector.dot"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.dot" title="Permalink to this definition"></a></dt>
<dd><p>Dot product with a SparseVector or 1- or 2-dimensional NumPy array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
<span class="go">25.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">]))</span>
<span class="go">22.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">b</span><span class="p">)</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">4</span><span class="p">]]))</span>
<span class="go">array([ 22., 22.])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">DenseVector</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.linalg.SparseVector.indices">
<code class="descname">indices</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.indices" title="Permalink to this definition"></a></dt>
<dd><p>A list of indices corresponding to active entries.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.SparseVector.norm">
<code class="descname">norm</code><span class="sig-paren">(</span><em>p</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseVector.norm"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.norm" title="Permalink to this definition"></a></dt>
<dd><p>Calculates the norm of a SparseVector.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mf">3.</span><span class="p">,</span> <span class="o">-</span><span class="mf">4.</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="go">7.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="go">5.0</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.SparseVector.numNonzeros">
<code class="descname">numNonzeros</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseVector.numNonzeros"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.numNonzeros" title="Permalink to this definition"></a></dt>
<dd><p>Number of nonzero elements. This scans all active values and counts the nonzeros.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.linalg.SparseVector.size">
<code class="descname">size</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.size" title="Permalink to this definition"></a></dt>
<dd><p>Size of the vector.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.SparseVector.squared_distance">
<code class="descname">squared_distance</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseVector.squared_distance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.squared_distance" title="Permalink to this definition"></a></dt>
<dd><p>Squared distance from a SparseVector or 1-dimensional NumPy array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">array</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="s1">&#39;d&#39;</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">]))</span>
<span class="go">11.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">]))</span>
<span class="go">11.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span> <span class="o">=</span> <span class="n">SparseVector</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">b</span><span class="p">)</span>
<span class="go">26.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
<span class="go">26.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">])</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">SparseVector</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,]))</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">AssertionError</span>: <span class="n">dimension mismatch</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.SparseVector.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseVector.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Returns a copy of this SparseVector as a 1-dimensional NumPy array.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.linalg.SparseVector.values">
<code class="descname">values</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.linalg.SparseVector.values" title="Permalink to this definition"></a></dt>
<dd><p>A list of values corresponding to active entries.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.linalg.Vectors">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">Vectors</code><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vectors"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vectors" title="Permalink to this definition"></a></dt>
<dd><p>Factory methods for working with vectors.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Dense vectors are simply represented as NumPy array objects,
so there is no need to convert them for use in MLlib. For sparse vectors,
the factory methods in this class create an MLlib-compatible type, or users
can pass in SciPy’s <code class="xref py py-class docutils literal"><span class="pre">scipy.sparse</span></code> column vectors.</p>
</div>
<dl class="staticmethod">
<dt id="pyspark.ml.linalg.Vectors.dense">
<em class="property">static </em><code class="descname">dense</code><span class="sig-paren">(</span><em>*elements</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vectors.dense"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vectors.dense" title="Permalink to this definition"></a></dt>
<dd><p>Create a dense vector of 64-bit floats from a Python list or numbers.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">])</span>
<span class="go">DenseVector([1.0, 2.0, 3.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="go">DenseVector([1.0, 2.0])</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.linalg.Vectors.norm">
<em class="property">static </em><code class="descname">norm</code><span class="sig-paren">(</span><em>vector</em>, <em>p</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vectors.norm"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vectors.norm" title="Permalink to this definition"></a></dt>
<dd><p>Find norm of the given vector.</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.linalg.Vectors.sparse">
<em class="property">static </em><code class="descname">sparse</code><span class="sig-paren">(</span><em>size</em>, <em>*args</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vectors.sparse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vectors.sparse" title="Permalink to this definition"></a></dt>
<dd><p>Create a sparse vector, using either a dictionary, a list of
(index, value) pairs, or two separate arrays of indices and
values (sorted by index).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>size</strong> – Size of the vector.</li>
<li><strong>args</strong> – Non-zero entries, as a dictionary, list of tuples,
or two sorted lists containing indices and values.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mi">3</span><span class="p">:</span> <span class="mf">5.5</span><span class="p">})</span>
<span class="go">SparseVector(4, {1: 1.0, 3: 5.5})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">5.5</span><span class="p">)])</span>
<span class="go">SparseVector(4, {1: 1.0, 3: 5.5})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">5.5</span><span class="p">])</span>
<span class="go">SparseVector(4, {1: 1.0, 3: 5.5})</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.linalg.Vectors.squared_distance">
<em class="property">static </em><code class="descname">squared_distance</code><span class="sig-paren">(</span><em>v1</em>, <em>v2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vectors.squared_distance"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vectors.squared_distance" title="Permalink to this definition"></a></dt>
<dd><p>Squared distance between two vectors.
v1 and v2 can be of type SparseVector, DenseVector, np.ndarray
or array.array.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">a</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">b</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">a</span><span class="o">.</span><span class="n">squared_distance</span><span class="p">(</span><span class="n">b</span><span class="p">)</span>
<span class="go">51.0</span>
</pre></div>
</div>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.linalg.Vectors.zeros">
<em class="property">static </em><code class="descname">zeros</code><span class="sig-paren">(</span><em>size</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Vectors.zeros"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Vectors.zeros" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.linalg.Matrix">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">Matrix</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>isTransposed=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Matrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Matrix" title="Permalink to this definition"></a></dt>
<dd><dl class="method">
<dt id="pyspark.ml.linalg.Matrix.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Matrix.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Matrix.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Returns its elements in a NumPy ndarray.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.linalg.DenseMatrix">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">DenseMatrix</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>values</em>, <em>isTransposed=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Column-major dense matrix.</p>
<dl class="method">
<dt id="pyspark.ml.linalg.DenseMatrix.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseMatrix.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseMatrix.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Return a numpy.ndarray</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">m</span> <span class="o">=</span> <span class="n">DenseMatrix</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="nb">range</span><span class="p">(</span><span class="mi">4</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">m</span><span class="o">.</span><span class="n">toArray</span><span class="p">()</span>
<span class="go">array([[ 0., 2.],</span>
<span class="go"> [ 1., 3.]])</span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.DenseMatrix.toSparse">
<code class="descname">toSparse</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#DenseMatrix.toSparse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.DenseMatrix.toSparse" title="Permalink to this definition"></a></dt>
<dd><p>Convert to SparseMatrix</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.linalg.SparseMatrix">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">SparseMatrix</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>colPtrs</em>, <em>rowIndices</em>, <em>values</em>, <em>isTransposed=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseMatrix"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseMatrix" title="Permalink to this definition"></a></dt>
<dd><p>Sparse Matrix stored in CSC format.</p>
<dl class="method">
<dt id="pyspark.ml.linalg.SparseMatrix.toArray">
<code class="descname">toArray</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseMatrix.toArray"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseMatrix.toArray" title="Permalink to this definition"></a></dt>
<dd><p>Return a numpy.ndarray</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.linalg.SparseMatrix.toDense">
<code class="descname">toDense</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#SparseMatrix.toDense"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.SparseMatrix.toDense" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.linalg.Matrices">
<em class="property">class </em><code class="descclassname">pyspark.ml.linalg.</code><code class="descname">Matrices</code><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Matrices"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Matrices" title="Permalink to this definition"></a></dt>
<dd><dl class="staticmethod">
<dt id="pyspark.ml.linalg.Matrices.dense">
<em class="property">static </em><code class="descname">dense</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>values</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Matrices.dense"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Matrices.dense" title="Permalink to this definition"></a></dt>
<dd><p>Create a DenseMatrix</p>
</dd></dl>
<dl class="staticmethod">
<dt id="pyspark.ml.linalg.Matrices.sparse">
<em class="property">static </em><code class="descname">sparse</code><span class="sig-paren">(</span><em>numRows</em>, <em>numCols</em>, <em>colPtrs</em>, <em>rowIndices</em>, <em>values</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/linalg.html#Matrices.sparse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.linalg.Matrices.sparse" title="Permalink to this definition"></a></dt>
<dd><p>Create a SparseMatrix</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.recommendation">
<span id="pyspark-ml-recommendation-module"></span><h2>pyspark.ml.recommendation module<a class="headerlink" href="#module-pyspark.ml.recommendation" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.recommendation.ALS">
<em class="property">class </em><code class="descclassname">pyspark.ml.recommendation.</code><code class="descname">ALS</code><span class="sig-paren">(</span><em>rank=10</em>, <em>maxIter=10</em>, <em>regParam=0.1</em>, <em>numUserBlocks=10</em>, <em>numItemBlocks=10</em>, <em>implicitPrefs=False</em>, <em>alpha=1.0</em>, <em>userCol='user'</em>, <em>itemCol='item'</em>, <em>seed=None</em>, <em>ratingCol='rating'</em>, <em>nonnegative=False</em>, <em>checkpointInterval=10</em>, <em>intermediateStorageLevel='MEMORY_AND_DISK'</em>, <em>finalStorageLevel='MEMORY_AND_DISK'</em>, <em>coldStartStrategy='nan'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS" title="Permalink to this definition"></a></dt>
<dd><p>Alternating Least Squares (ALS) matrix factorization.</p>
<p>ALS attempts to estimate the ratings matrix <cite>R</cite> as the product of
two lower-rank matrices, <cite>X</cite> and <cite>Y</cite>, i.e. <cite>X * Yt = R</cite>. Typically
these approximations are called ‘factor’ matrices. The general
approach is iterative. During each iteration, one of the factor
matrices is held constant, while the other is solved for using least
squares. The newly-solved factor matrix is then held constant while
solving for the other factor matrix.</p>
<p>This is a blocked implementation of the ALS factorization algorithm
that groups the two sets of factors (referred to as “users” and
“products”) into blocks and reduces communication by only sending
one copy of each user vector to each product block on each
iteration, and only for the product blocks that need that user’s
feature vector. This is achieved by pre-computing some information
about the ratings matrix to determine the “out-links” of each user
(which blocks of products it will contribute to) and “in-link”
information for each product (which of the feature vectors it
receives from each user block it will depend on). This allows us to
send only an array of feature vectors between each user block and
product block, and have the product block find the users’ ratings
and update the products based on these messages.</p>
<p>For implicit preference data, the algorithm used is based on
<a class="reference external" href="http://dx.doi.org/10.1109/ICDM.2008.22">“Collaborative Filtering for Implicit Feedback Datasets”</a>, adapted for the blocked
approach used here.</p>
<p>Essentially instead of finding the low-rank approximations to the
rating matrix <cite>R</cite>, this finds the approximations for a preference
matrix <cite>P</cite> where the elements of <cite>P</cite> are 1 if r &gt; 0 and 0 if r &lt;= 0.
The ratings then act as ‘confidence’ values related to strength of
indicated user preferences rather than explicit ratings given to
items.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">)],</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;user&quot;</span><span class="p">,</span> <span class="s2">&quot;item&quot;</span><span class="p">,</span> <span class="s2">&quot;rating&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">als</span> <span class="o">=</span> <span class="n">ALS</span><span class="p">(</span><span class="n">rank</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">als</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">rank</span>
<span class="go">10</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">userFactors</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(id=0, features=[...]), Row(id=1, ...), Row(id=2, ...)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">)],</span> <span class="p">[</span><span class="s2">&quot;user&quot;</span><span class="p">,</span> <span class="s2">&quot;item&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">predictions</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">r</span><span class="p">:</span> <span class="n">r</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">predictions</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">Row(user=0, item=2, prediction=-0.13807615637779236)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">predictions</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="go">Row(user=1, item=0, prediction=2.6258413791656494)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">predictions</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
<span class="go">Row(user=2, item=0, prediction=-1.5018409490585327)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">user_recs</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">recommendForAllUsers</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">user_recs</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">user_recs</span><span class="o">.</span><span class="n">user</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;recommendations.item&quot;</span><span class="p">,</span> <span class="s2">&quot;recommendations.rating&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(item=[0, 1, 2], rating=[3.910..., 1.992..., -0.138...])]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">item_recs</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">recommendForAllItems</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">item_recs</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">item_recs</span><span class="o">.</span><span class="n">item</span> <span class="o">==</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;recommendations.user&quot;</span><span class="p">,</span> <span class="s2">&quot;recommendations.rating&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="go">[Row(user=[2, 1, 0], rating=[4.901..., 3.981..., -0.138...])]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">als_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/als&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">als</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">als_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">als2</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">als_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">als2</span><span class="o">.</span><span class="n">getMaxIter</span><span class="p">()</span>
<span class="go">5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/als_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">ALSModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">rank</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">rank</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">userFactors</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span> <span class="o">==</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">model2</span><span class="o">.</span><span class="n">userFactors</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">itemFactors</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span> <span class="o">==</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">model2</span><span class="o">.</span><span class="n">itemFactors</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.alpha">
<code class="descname">alpha</code><em class="property"> = Param(parent='undefined', name='alpha', doc='alpha for implicit preference')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.alpha" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.coldStartStrategy">
<code class="descname">coldStartStrategy</code><em class="property"> = Param(parent='undefined', name='coldStartStrategy', doc=&quot;strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'.&quot;)</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.coldStartStrategy" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.finalStorageLevel">
<code class="descname">finalStorageLevel</code><em class="property"> = Param(parent='undefined', name='finalStorageLevel', doc='StorageLevel for ALS model factors.')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.finalStorageLevel" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getAlpha">
<code class="descname">getAlpha</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getAlpha"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getAlpha" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of alpha or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getColdStartStrategy">
<code class="descname">getColdStartStrategy</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getColdStartStrategy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getColdStartStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of coldStartStrategy or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getFinalStorageLevel">
<code class="descname">getFinalStorageLevel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getFinalStorageLevel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getFinalStorageLevel" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of finalStorageLevel or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getImplicitPrefs">
<code class="descname">getImplicitPrefs</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getImplicitPrefs"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getImplicitPrefs" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of implicitPrefs or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getIntermediateStorageLevel">
<code class="descname">getIntermediateStorageLevel</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getIntermediateStorageLevel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getIntermediateStorageLevel" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of intermediateStorageLevel or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getItemCol">
<code class="descname">getItemCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getItemCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getItemCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of itemCol or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getNonnegative">
<code class="descname">getNonnegative</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getNonnegative"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getNonnegative" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of nonnegative or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getNumItemBlocks">
<code class="descname">getNumItemBlocks</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getNumItemBlocks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getNumItemBlocks" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numItemBlocks or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getNumUserBlocks">
<code class="descname">getNumUserBlocks</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getNumUserBlocks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getNumUserBlocks" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numUserBlocks or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getRank">
<code class="descname">getRank</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getRank"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getRank" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of rank or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getRatingCol">
<code class="descname">getRatingCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getRatingCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getRatingCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of ratingCol or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getRegParam">
<code class="descname">getRegParam</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of regParam or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.getUserCol">
<code class="descname">getUserCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.getUserCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.getUserCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of userCol or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.implicitPrefs">
<code class="descname">implicitPrefs</code><em class="property"> = Param(parent='undefined', name='implicitPrefs', doc='whether to use implicit preference')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.implicitPrefs" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.intermediateStorageLevel">
<code class="descname">intermediateStorageLevel</code><em class="property"> = Param(parent='undefined', name='intermediateStorageLevel', doc=&quot;StorageLevel for intermediate datasets. Cannot be 'NONE'.&quot;)</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.intermediateStorageLevel" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.itemCol">
<code class="descname">itemCol</code><em class="property"> = Param(parent='undefined', name='itemCol', doc='column name for item ids. Ids must be within the integer value range.')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.itemCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.nonnegative">
<code class="descname">nonnegative</code><em class="property"> = Param(parent='undefined', name='nonnegative', doc='whether to use nonnegative constraint for least squares')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.nonnegative" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.numItemBlocks">
<code class="descname">numItemBlocks</code><em class="property"> = Param(parent='undefined', name='numItemBlocks', doc='number of item blocks')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.numItemBlocks" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.numUserBlocks">
<code class="descname">numUserBlocks</code><em class="property"> = Param(parent='undefined', name='numUserBlocks', doc='number of user blocks')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.numUserBlocks" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.recommendation.ALS.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.rank">
<code class="descname">rank</code><em class="property"> = Param(parent='undefined', name='rank', doc='rank of the factorization')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.rank" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.ratingCol">
<code class="descname">ratingCol</code><em class="property"> = Param(parent='undefined', name='ratingCol', doc='column name for ratings')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.ratingCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.regParam">
<code class="descname">regParam</code><em class="property"> = Param(parent='undefined', name='regParam', doc='regularization parameter (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.regParam" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setAlpha">
<code class="descname">setAlpha</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setAlpha"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setAlpha" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.alpha" title="pyspark.ml.recommendation.ALS.alpha"><code class="xref py py-attr docutils literal"><span class="pre">alpha</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.checkpointInterval" title="pyspark.ml.recommendation.ALS.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setColdStartStrategy">
<code class="descname">setColdStartStrategy</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setColdStartStrategy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setColdStartStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.coldStartStrategy" title="pyspark.ml.recommendation.ALS.coldStartStrategy"><code class="xref py py-attr docutils literal"><span class="pre">coldStartStrategy</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setFinalStorageLevel">
<code class="descname">setFinalStorageLevel</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setFinalStorageLevel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setFinalStorageLevel" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.finalStorageLevel" title="pyspark.ml.recommendation.ALS.finalStorageLevel"><code class="xref py py-attr docutils literal"><span class="pre">finalStorageLevel</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setImplicitPrefs">
<code class="descname">setImplicitPrefs</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setImplicitPrefs"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setImplicitPrefs" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.implicitPrefs" title="pyspark.ml.recommendation.ALS.implicitPrefs"><code class="xref py py-attr docutils literal"><span class="pre">implicitPrefs</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setIntermediateStorageLevel">
<code class="descname">setIntermediateStorageLevel</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setIntermediateStorageLevel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setIntermediateStorageLevel" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.intermediateStorageLevel" title="pyspark.ml.recommendation.ALS.intermediateStorageLevel"><code class="xref py py-attr docutils literal"><span class="pre">intermediateStorageLevel</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setItemCol">
<code class="descname">setItemCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setItemCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setItemCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.itemCol" title="pyspark.ml.recommendation.ALS.itemCol"><code class="xref py py-attr docutils literal"><span class="pre">itemCol</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.maxIter" title="pyspark.ml.recommendation.ALS.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setNonnegative">
<code class="descname">setNonnegative</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setNonnegative"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setNonnegative" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.nonnegative" title="pyspark.ml.recommendation.ALS.nonnegative"><code class="xref py py-attr docutils literal"><span class="pre">nonnegative</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setNumBlocks">
<code class="descname">setNumBlocks</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setNumBlocks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setNumBlocks" title="Permalink to this definition"></a></dt>
<dd><p>Sets both <a class="reference internal" href="#pyspark.ml.recommendation.ALS.numUserBlocks" title="pyspark.ml.recommendation.ALS.numUserBlocks"><code class="xref py py-attr docutils literal"><span class="pre">numUserBlocks</span></code></a> and <a class="reference internal" href="#pyspark.ml.recommendation.ALS.numItemBlocks" title="pyspark.ml.recommendation.ALS.numItemBlocks"><code class="xref py py-attr docutils literal"><span class="pre">numItemBlocks</span></code></a> to the specified value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setNumItemBlocks">
<code class="descname">setNumItemBlocks</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setNumItemBlocks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setNumItemBlocks" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.numItemBlocks" title="pyspark.ml.recommendation.ALS.numItemBlocks"><code class="xref py py-attr docutils literal"><span class="pre">numItemBlocks</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setNumUserBlocks">
<code class="descname">setNumUserBlocks</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setNumUserBlocks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setNumUserBlocks" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.numUserBlocks" title="pyspark.ml.recommendation.ALS.numUserBlocks"><code class="xref py py-attr docutils literal"><span class="pre">numUserBlocks</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>rank=10</em>, <em>maxIter=10</em>, <em>regParam=0.1</em>, <em>numUserBlocks=10</em>, <em>numItemBlocks=10</em>, <em>implicitPrefs=False</em>, <em>alpha=1.0</em>, <em>userCol=&quot;user&quot;</em>, <em>itemCol=&quot;item&quot;</em>, <em>seed=None</em>, <em>ratingCol=&quot;rating&quot;</em>, <em>nonnegative=False</em>, <em>checkpointInterval=10</em>, <em>intermediateStorageLevel=&quot;MEMORY_AND_DISK&quot;</em>, <em>finalStorageLevel=&quot;MEMORY_AND_DISK&quot;</em>, <em>coldStartStrategy=&quot;nan&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for ALS.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.predictionCol" title="pyspark.ml.recommendation.ALS.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setRank">
<code class="descname">setRank</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setRank"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setRank" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.rank" title="pyspark.ml.recommendation.ALS.rank"><code class="xref py py-attr docutils literal"><span class="pre">rank</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setRatingCol">
<code class="descname">setRatingCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setRatingCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setRatingCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.ratingCol" title="pyspark.ml.recommendation.ALS.ratingCol"><code class="xref py py-attr docutils literal"><span class="pre">ratingCol</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setRegParam">
<code class="descname">setRegParam</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.regParam" title="pyspark.ml.recommendation.ALS.regParam"><code class="xref py py-attr docutils literal"><span class="pre">regParam</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.seed" title="pyspark.ml.recommendation.ALS.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.setUserCol">
<code class="descname">setUserCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALS.setUserCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALS.setUserCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.recommendation.ALS.userCol" title="pyspark.ml.recommendation.ALS.userCol"><code class="xref py py-attr docutils literal"><span class="pre">userCol</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALS.userCol">
<code class="descname">userCol</code><em class="property"> = Param(parent='undefined', name='userCol', doc='column name for user ids. Ids must be within the integer value range.')</em><a class="headerlink" href="#pyspark.ml.recommendation.ALS.userCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALS.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALS.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.recommendation.ALSModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.recommendation.</code><code class="descname">ALSModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALSModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by ALS.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALSModel.itemFactors">
<code class="descname">itemFactors</code><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.itemFactors" title="Permalink to this definition"></a></dt>
<dd><p>a DataFrame that stores item factors in two columns: <cite>id</cite> and
<cite>features</cite></p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALSModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALSModel.rank">
<code class="descname">rank</code><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.rank" title="Permalink to this definition"></a></dt>
<dd><p>rank of the matrix factorization model</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.recommendForAllItems">
<code class="descname">recommendForAllItems</code><span class="sig-paren">(</span><em>numUsers</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALSModel.recommendForAllItems"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.recommendForAllItems" title="Permalink to this definition"></a></dt>
<dd><p>Returns top <cite>numUsers</cite> users recommended for each item, for all items.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>numUsers</strong> – max number of recommendations for each item</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a DataFrame of (itemCol, recommendations), where recommendations are
stored as an array of (userCol, rating) Rows.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.recommendForAllUsers">
<code class="descname">recommendForAllUsers</code><span class="sig-paren">(</span><em>numItems</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/recommendation.html#ALSModel.recommendForAllUsers"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.recommendForAllUsers" title="Permalink to this definition"></a></dt>
<dd><p>Returns top <cite>numItems</cite> items recommended for each user, for all users.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>numItems</strong> – max number of recommendations for each user</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a DataFrame of (userCol, recommendations), where recommendations are
stored as an array of (itemCol, rating) Rows.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.recommendation.ALSModel.userFactors">
<code class="descname">userFactors</code><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.userFactors" title="Permalink to this definition"></a></dt>
<dd><p>a DataFrame that stores user factors in two columns: <cite>id</cite> and
<cite>features</cite></p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.recommendation.ALSModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.recommendation.ALSModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.regression">
<span id="pyspark-ml-regression-module"></span><h2>pyspark.ml.regression module<a class="headerlink" href="#module-pyspark.ml.regression" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.regression.AFTSurvivalRegression">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">AFTSurvivalRegression</code><span class="sig-paren">(</span><em>featuresCol='features', labelCol='label', predictionCol='prediction', fitIntercept=True, maxIter=100, tol=1e-06, censorCol='censor', quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], quantilesCol=None, aggregationDepth=2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Accelerated Failure Time (AFT) Model Survival Regression</p>
<p>Fit a parametric AFT survival regression model based on the Weibull distribution
of the survival time.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://en.wikipedia.org/wiki/Accelerated_failure_time_model">AFT Model</a></p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">),</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1e-40</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]),</span> <span class="mf">0.0</span><span class="p">)],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="s2">&quot;censor&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">aftsr</span> <span class="o">=</span> <span class="n">AFTSurvivalRegression</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">aftsr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">6.3</span><span class="p">))</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predictQuantiles</span><span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">6.3</span><span class="p">))</span>
<span class="go">DenseVector([0.0101, 0.0513, 0.1054, 0.2877, 0.6931, 1.3863, 2.3026, 2.9957, 4.6052])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+-------+---------+------+----------+</span>
<span class="go">| label| features|censor|prediction|</span>
<span class="go">+-------+---------+------+----------+</span>
<span class="go">| 1.0| [1.0]| 1.0| 1.0|</span>
<span class="go">|1.0E-40|(1,[],[])| 0.0| 1.0|</span>
<span class="go">+-------+---------+------+----------+</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">aftsr_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/aftsr&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">aftsr</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">aftsr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">aftsr2</span> <span class="o">=</span> <span class="n">AFTSurvivalRegression</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">aftsr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">aftsr2</span><span class="o">.</span><span class="n">getMaxIter</span><span class="p">()</span>
<span class="go">100</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/aftsr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">AFTSurvivalRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">coefficients</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">coefficients</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">intercept</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">intercept</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">scale</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">scale</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.aggregationDepth">
<code class="descname">aggregationDepth</code><em class="property"> = Param(parent='undefined', name='aggregationDepth', doc='suggested depth for treeAggregate (&gt;= 2).')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.aggregationDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.censorCol">
<code class="descname">censorCol</code><em class="property"> = Param(parent='undefined', name='censorCol', doc='censor column name. The value of this column could be 0 or 1. If the value is 1, it means the event has occurred i.e. uncensored; otherwise censored.')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.censorCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.fitIntercept">
<code class="descname">fitIntercept</code><em class="property"> = Param(parent='undefined', name='fitIntercept', doc='whether to fit an intercept term.')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.fitIntercept" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getAggregationDepth">
<code class="descname">getAggregationDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of aggregationDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getCensorCol">
<code class="descname">getCensorCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression.getCensorCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getCensorCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of censorCol or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getFitIntercept">
<code class="descname">getFitIntercept</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fitIntercept or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getQuantileProbabilities">
<code class="descname">getQuantileProbabilities</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression.getQuantileProbabilities"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getQuantileProbabilities" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of quantileProbabilities or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getQuantilesCol">
<code class="descname">getQuantilesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression.getQuantilesCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getQuantilesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of quantilesCol or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.quantileProbabilities">
<code class="descname">quantileProbabilities</code><em class="property"> = Param(parent='undefined', name='quantileProbabilities', doc='quantile probabilities array. Values of the quantile probabilities array should be in the range (0, 1) and the array should be non-empty.')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.quantileProbabilities" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.quantilesCol">
<code class="descname">quantilesCol</code><em class="property"> = Param(parent='undefined', name='quantilesCol', doc='quantiles column name. This column will output quantiles of corresponding quantileProbabilities if it is set.')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.quantilesCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setAggregationDepth">
<code class="descname">setAggregationDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.aggregationDepth" title="pyspark.ml.regression.AFTSurvivalRegression.aggregationDepth"><code class="xref py py-attr docutils literal"><span class="pre">aggregationDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setCensorCol">
<code class="descname">setCensorCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression.setCensorCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setCensorCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.censorCol" title="pyspark.ml.regression.AFTSurvivalRegression.censorCol"><code class="xref py py-attr docutils literal"><span class="pre">censorCol</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.featuresCol" title="pyspark.ml.regression.AFTSurvivalRegression.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setFitIntercept">
<code class="descname">setFitIntercept</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.fitIntercept" title="pyspark.ml.regression.AFTSurvivalRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">fitIntercept</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.labelCol" title="pyspark.ml.regression.AFTSurvivalRegression.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.maxIter" title="pyspark.ml.regression.AFTSurvivalRegression.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>featuresCol='features', labelCol='label', predictionCol='prediction', fitIntercept=True, maxIter=100, tol=1e-06, censorCol='censor', quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], quantilesCol=None, aggregationDepth=2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, featuresCol=&quot;features&quot;, labelCol=&quot;label&quot;, predictionCol=&quot;prediction&quot;, fitIntercept=True, maxIter=100, tol=1E-6, censorCol=&quot;censor&quot;, quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], quantilesCol=None, aggregationDepth=2):</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.predictionCol" title="pyspark.ml.regression.AFTSurvivalRegression.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setQuantileProbabilities">
<code class="descname">setQuantileProbabilities</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression.setQuantileProbabilities"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setQuantileProbabilities" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.quantileProbabilities" title="pyspark.ml.regression.AFTSurvivalRegression.quantileProbabilities"><code class="xref py py-attr docutils literal"><span class="pre">quantileProbabilities</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setQuantilesCol">
<code class="descname">setQuantilesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegression.setQuantilesCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setQuantilesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.quantilesCol" title="pyspark.ml.regression.AFTSurvivalRegression.quantilesCol"><code class="xref py py-attr docutils literal"><span class="pre">quantilesCol</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression.tol" title="pyspark.ml.regression.AFTSurvivalRegression.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegression.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegression.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">AFTSurvivalRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by <a class="reference internal" href="#pyspark.ml.regression.AFTSurvivalRegression" title="pyspark.ml.regression.AFTSurvivalRegression"><code class="xref py py-class docutils literal"><span class="pre">AFTSurvivalRegression</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.coefficients">
<code class="descname">coefficients</code><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.coefficients" title="Permalink to this definition"></a></dt>
<dd><p>Model coefficients.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Model intercept.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>features</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegressionModel.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.predict" title="Permalink to this definition"></a></dt>
<dd><p>Predicted value</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.predictQuantiles">
<code class="descname">predictQuantiles</code><span class="sig-paren">(</span><em>features</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#AFTSurvivalRegressionModel.predictQuantiles"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.predictQuantiles" title="Permalink to this definition"></a></dt>
<dd><p>Predicted Quantiles</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.scale">
<code class="descname">scale</code><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.scale" title="Permalink to this definition"></a></dt>
<dd><p>Model scale parameter.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.AFTSurvivalRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.AFTSurvivalRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.DecisionTreeRegressor">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">DecisionTreeRegressor</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>impurity='variance'</em>, <em>seed=None</em>, <em>varianceCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#DecisionTreeRegressor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor" title="Permalink to this definition"></a></dt>
<dd><p><a class="reference external" href="https://en.wikipedia.org/wiki/Decision_tree_learning">Decision tree</a>
learning algorithm for regression.
It supports both continuous and categorical features.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt</span> <span class="o">=</span> <span class="n">DecisionTreeRegressor</span><span class="p">(</span><span class="n">maxDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">varianceCol</span><span class="o">=</span><span class="s2">&quot;variance&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">dt</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">depth</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numNodes</span>
<span class="go">3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">SparseVector(1, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dtr_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/dtr&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">dtr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt2</span> <span class="o">=</span> <span class="n">DecisionTreeRegressor</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">dtr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dt2</span><span class="o">.</span><span class="n">getMaxDepth</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/dtr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">DecisionTreeRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numNodes</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">numNodes</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">depth</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">depth</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">variance</span>
<span class="go">0.0</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.cacheNodeIds">
<code class="descname">cacheNodeIds</code><em class="property"> = Param(parent='undefined', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.cacheNodeIds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getCacheNodeIds">
<code class="descname">getCacheNodeIds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of cacheNodeIds or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getImpurity">
<code class="descname">getImpurity</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of impurity or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getMaxBins">
<code class="descname">getMaxBins</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxBins or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getMaxDepth">
<code class="descname">getMaxDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getMaxMemoryInMB">
<code class="descname">getMaxMemoryInMB</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxMemoryInMB or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getMinInfoGain">
<code class="descname">getMinInfoGain</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInfoGain or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getMinInstancesPerNode">
<code class="descname">getMinInstancesPerNode</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInstancesPerNode or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.getVarianceCol">
<code class="descname">getVarianceCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.getVarianceCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of varianceCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.impurity">
<code class="descname">impurity</code><em class="property"> = Param(parent='undefined', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: variance')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.impurity" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.maxBins">
<code class="descname">maxBins</code><em class="property"> = Param(parent='undefined', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be &gt;=2 and &gt;= number of categories for any categorical feature.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.maxBins" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.maxDepth">
<code class="descname">maxDepth</code><em class="property"> = Param(parent='undefined', name='maxDepth', doc='Maximum depth of the tree. (&gt;= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.maxDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.maxMemoryInMB">
<code class="descname">maxMemoryInMB</code><em class="property"> = Param(parent='undefined', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.maxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.minInfoGain">
<code class="descname">minInfoGain</code><em class="property"> = Param(parent='undefined', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.minInfoGain" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.minInstancesPerNode">
<code class="descname">minInstancesPerNode</code><em class="property"> = Param(parent='undefined', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be &gt;= 1.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.minInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setCacheNodeIds">
<code class="descname">setCacheNodeIds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.cacheNodeIds" title="pyspark.ml.regression.DecisionTreeRegressor.cacheNodeIds"><code class="xref py py-attr docutils literal"><span class="pre">cacheNodeIds</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.checkpointInterval" title="pyspark.ml.regression.DecisionTreeRegressor.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.featuresCol" title="pyspark.ml.regression.DecisionTreeRegressor.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setImpurity">
<code class="descname">setImpurity</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.impurity" title="pyspark.ml.regression.DecisionTreeRegressor.impurity"><code class="xref py py-attr docutils literal"><span class="pre">impurity</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.labelCol" title="pyspark.ml.regression.DecisionTreeRegressor.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setMaxBins">
<code class="descname">setMaxBins</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.maxBins" title="pyspark.ml.regression.DecisionTreeRegressor.maxBins"><code class="xref py py-attr docutils literal"><span class="pre">maxBins</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setMaxDepth">
<code class="descname">setMaxDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.maxDepth" title="pyspark.ml.regression.DecisionTreeRegressor.maxDepth"><code class="xref py py-attr docutils literal"><span class="pre">maxDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setMaxMemoryInMB">
<code class="descname">setMaxMemoryInMB</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.maxMemoryInMB" title="pyspark.ml.regression.DecisionTreeRegressor.maxMemoryInMB"><code class="xref py py-attr docutils literal"><span class="pre">maxMemoryInMB</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setMinInfoGain">
<code class="descname">setMinInfoGain</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.minInfoGain" title="pyspark.ml.regression.DecisionTreeRegressor.minInfoGain"><code class="xref py py-attr docutils literal"><span class="pre">minInfoGain</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setMinInstancesPerNode">
<code class="descname">setMinInstancesPerNode</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.minInstancesPerNode" title="pyspark.ml.regression.DecisionTreeRegressor.minInstancesPerNode"><code class="xref py py-attr docutils literal"><span class="pre">minInstancesPerNode</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>impurity=&quot;variance&quot;</em>, <em>seed=None</em>, <em>varianceCol=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#DecisionTreeRegressor.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for the DecisionTreeRegressor.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.predictionCol" title="pyspark.ml.regression.DecisionTreeRegressor.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.seed" title="pyspark.ml.regression.DecisionTreeRegressor.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.setVarianceCol">
<code class="descname">setVarianceCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.setVarianceCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor.varianceCol" title="pyspark.ml.regression.DecisionTreeRegressor.varianceCol"><code class="xref py py-attr docutils literal"><span class="pre">varianceCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.supportedImpurities">
<code class="descname">supportedImpurities</code><em class="property"> = ['variance']</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.supportedImpurities" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.varianceCol">
<code class="descname">varianceCol</code><em class="property"> = Param(parent='undefined', name='varianceCol', doc='column name for the biased sample variance of prediction.')</em><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.varianceCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressor.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressor.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">DecisionTreeRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#DecisionTreeRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressor" title="pyspark.ml.regression.DecisionTreeRegressor"><code class="xref py py-class docutils literal"><span class="pre">DecisionTreeRegressor</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.depth">
<code class="descname">depth</code><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.depth" title="Permalink to this definition"></a></dt>
<dd><p>Return depth of the decision tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.featureImportances">
<code class="descname">featureImportances</code><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.featureImportances" title="Permalink to this definition"></a></dt>
<dd><p>Estimate of the importance of each feature.</p>
<p>This generalizes the idea of “Gini” importance to other losses,
following the explanation of Gini importance from “Random Forests” documentation
by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.</p>
<dl class="docutils">
<dt>This feature importance is calculated as follows:</dt>
<dd><ul class="first last simple">
<li>importance(feature j) = sum (over nodes which split on feature j) of the gain,
where gain is scaled by the number of instances passing through node</li>
<li>Normalize importances for tree to sum to 1.</li>
</ul>
</dd>
</dl>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Feature importance for single decision trees can have high variance due to
correlated predictor variables. Consider using a <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor" title="pyspark.ml.regression.RandomForestRegressor"><code class="xref py py-class docutils literal"><span class="pre">RandomForestRegressor</span></code></a>
to determine feature importance instead.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.numNodes">
<code class="descname">numNodes</code><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.numNodes" title="Permalink to this definition"></a></dt>
<dd><p>Return number of nodes of the decision tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.toDebugString">
<code class="descname">toDebugString</code><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full description of model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.DecisionTreeRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.DecisionTreeRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.GBTRegressor">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">GBTRegressor</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>subsamplingRate=1.0</em>, <em>checkpointInterval=10</em>, <em>lossType='squared'</em>, <em>maxIter=20</em>, <em>stepSize=0.1</em>, <em>seed=None</em>, <em>impurity='variance'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GBTRegressor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor" title="Permalink to this definition"></a></dt>
<dd><p><a class="reference external" href="https://en.wikipedia.org/wiki/Gradient_boosting">Gradient-Boosted Trees (GBTs)</a>
learning algorithm for regression.
It supports both continuous and categorical features.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">numpy</span> <span class="k">import</span> <span class="n">allclose</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt</span> <span class="o">=</span> <span class="n">GBTRegressor</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">maxDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="n">gbt</span><span class="o">.</span><span class="n">getImpurity</span><span class="p">())</span>
<span class="go">variance</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">gbt</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">SparseVector(1, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">allclose</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">treeWeights</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">])</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">1.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbtr_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;gbtr&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">gbtr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt2</span> <span class="o">=</span> <span class="n">GBTRegressor</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">gbtr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gbt2</span><span class="o">.</span><span class="n">getMaxDepth</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;gbtr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">GBTRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">treeWeights</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">treeWeights</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">trees</span>
<span class="go">[DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.cacheNodeIds">
<code class="descname">cacheNodeIds</code><em class="property"> = Param(parent='undefined', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.cacheNodeIds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getCacheNodeIds">
<code class="descname">getCacheNodeIds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of cacheNodeIds or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getImpurity">
<code class="descname">getImpurity</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of impurity or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getLossType">
<code class="descname">getLossType</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GBTRegressor.getLossType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getLossType" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of lossType or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getMaxBins">
<code class="descname">getMaxBins</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxBins or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getMaxDepth">
<code class="descname">getMaxDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getMaxMemoryInMB">
<code class="descname">getMaxMemoryInMB</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxMemoryInMB or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getMinInfoGain">
<code class="descname">getMinInfoGain</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInfoGain or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getMinInstancesPerNode">
<code class="descname">getMinInstancesPerNode</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInstancesPerNode or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getStepSize">
<code class="descname">getStepSize</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of stepSize or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.getSubsamplingRate">
<code class="descname">getSubsamplingRate</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.getSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of subsamplingRate or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.impurity">
<code class="descname">impurity</code><em class="property"> = Param(parent='undefined', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: variance')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.impurity" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <code class="docutils literal"><span class="pre">read().load(path)</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.lossType">
<code class="descname">lossType</code><em class="property"> = Param(parent='undefined', name='lossType', doc='Loss function which GBT tries to minimize (case-insensitive). Supported options: squared, absolute')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.lossType" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.maxBins">
<code class="descname">maxBins</code><em class="property"> = Param(parent='undefined', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be &gt;=2 and &gt;= number of categories for any categorical feature.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.maxBins" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.maxDepth">
<code class="descname">maxDepth</code><em class="property"> = Param(parent='undefined', name='maxDepth', doc='Maximum depth of the tree. (&gt;= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.maxDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.maxMemoryInMB">
<code class="descname">maxMemoryInMB</code><em class="property"> = Param(parent='undefined', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.maxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.minInfoGain">
<code class="descname">minInfoGain</code><em class="property"> = Param(parent='undefined', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.minInfoGain" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.minInstancesPerNode">
<code class="descname">minInstancesPerNode</code><em class="property"> = Param(parent='undefined', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be &gt;= 1.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.minInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <code class="docutils literal"><span class="pre">write().save(path)</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setCacheNodeIds">
<code class="descname">setCacheNodeIds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.cacheNodeIds" title="pyspark.ml.regression.GBTRegressor.cacheNodeIds"><code class="xref py py-attr docutils literal"><span class="pre">cacheNodeIds</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.checkpointInterval" title="pyspark.ml.regression.GBTRegressor.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.featuresCol" title="pyspark.ml.regression.GBTRegressor.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setImpurity">
<code class="descname">setImpurity</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.impurity" title="pyspark.ml.regression.GBTRegressor.impurity"><code class="xref py py-attr docutils literal"><span class="pre">impurity</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.labelCol" title="pyspark.ml.regression.GBTRegressor.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setLossType">
<code class="descname">setLossType</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GBTRegressor.setLossType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setLossType" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.lossType" title="pyspark.ml.regression.GBTRegressor.lossType"><code class="xref py py-attr docutils literal"><span class="pre">lossType</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setMaxBins">
<code class="descname">setMaxBins</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.maxBins" title="pyspark.ml.regression.GBTRegressor.maxBins"><code class="xref py py-attr docutils literal"><span class="pre">maxBins</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setMaxDepth">
<code class="descname">setMaxDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.maxDepth" title="pyspark.ml.regression.GBTRegressor.maxDepth"><code class="xref py py-attr docutils literal"><span class="pre">maxDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.maxIter" title="pyspark.ml.regression.GBTRegressor.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setMaxMemoryInMB">
<code class="descname">setMaxMemoryInMB</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.maxMemoryInMB" title="pyspark.ml.regression.GBTRegressor.maxMemoryInMB"><code class="xref py py-attr docutils literal"><span class="pre">maxMemoryInMB</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setMinInfoGain">
<code class="descname">setMinInfoGain</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.minInfoGain" title="pyspark.ml.regression.GBTRegressor.minInfoGain"><code class="xref py py-attr docutils literal"><span class="pre">minInfoGain</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setMinInstancesPerNode">
<code class="descname">setMinInstancesPerNode</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.minInstancesPerNode" title="pyspark.ml.regression.GBTRegressor.minInstancesPerNode"><code class="xref py py-attr docutils literal"><span class="pre">minInstancesPerNode</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>subsamplingRate=1.0</em>, <em>checkpointInterval=10</em>, <em>lossType=&quot;squared&quot;</em>, <em>maxIter=20</em>, <em>stepSize=0.1</em>, <em>seed=None</em>, <em>impurity=&quot;variance&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GBTRegressor.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for Gradient Boosted Tree Regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.predictionCol" title="pyspark.ml.regression.GBTRegressor.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.seed" title="pyspark.ml.regression.GBTRegressor.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setStepSize">
<code class="descname">setStepSize</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setStepSize" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.stepSize" title="pyspark.ml.regression.GBTRegressor.stepSize"><code class="xref py py-attr docutils literal"><span class="pre">stepSize</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.setSubsamplingRate">
<code class="descname">setSubsamplingRate</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.setSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor.subsamplingRate" title="pyspark.ml.regression.GBTRegressor.subsamplingRate"><code class="xref py py-attr docutils literal"><span class="pre">subsamplingRate</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.stepSize">
<code class="descname">stepSize</code><em class="property"> = Param(parent='undefined', name='stepSize', doc='Step size to be used for each iteration of optimization (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.stepSize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.subsamplingRate">
<code class="descname">subsamplingRate</code><em class="property"> = Param(parent='undefined', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].')</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.subsamplingRate" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.supportedImpurities">
<code class="descname">supportedImpurities</code><em class="property"> = ['variance']</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.supportedImpurities" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressor.supportedLossTypes">
<code class="descname">supportedLossTypes</code><em class="property"> = ['squared', 'absolute']</em><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.supportedLossTypes" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressor.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressor.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.GBTRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">GBTRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GBTRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.regression.GBTRegressor" title="pyspark.ml.regression.GBTRegressor"><code class="xref py py-class docutils literal"><span class="pre">GBTRegressor</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.featureImportances">
<code class="descname">featureImportances</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.featureImportances" title="Permalink to this definition"></a></dt>
<dd><p>Estimate of the importance of each feature.</p>
<p>Each feature’s importance is the average of its importance across all trees in the ensemble.
The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
(Hastie, Tibshirani, Friedman. “The Elements of Statistical Learning, 2nd Edition.” 2001.)
and follows the implementation from scikit-learn.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressionModel.featureImportances" title="pyspark.ml.regression.DecisionTreeRegressionModel.featureImportances"><code class="xref py py-attr docutils literal"><span class="pre">DecisionTreeRegressionModel.featureImportances</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.getNumTrees">
<code class="descname">getNumTrees</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.getNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Number of trees in ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.toDebugString">
<code class="descname">toDebugString</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full description of model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.totalNumNodes">
<code class="descname">totalNumNodes</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.totalNumNodes" title="Permalink to this definition"></a></dt>
<dd><p>Total number of nodes, summed over all trees in the ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.treeWeights">
<code class="descname">treeWeights</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.treeWeights" title="Permalink to this definition"></a></dt>
<dd><p>Return the weights for each tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GBTRegressionModel.trees">
<code class="descname">trees</code><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.trees" title="Permalink to this definition"></a></dt>
<dd><p>Trees in this ensemble. Warning: These have null parent Estimators.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GBTRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GBTRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">GeneralizedLinearRegression</code><span class="sig-paren">(</span><em>labelCol='label'</em>, <em>featuresCol='features'</em>, <em>predictionCol='prediction'</em>, <em>family='gaussian'</em>, <em>link=None</em>, <em>fitIntercept=True</em>, <em>maxIter=25</em>, <em>tol=1e-06</em>, <em>regParam=0.0</em>, <em>weightCol=None</em>, <em>solver='irls'</em>, <em>linkPredictionCol=None</em>, <em>variancePower=0.0</em>, <em>linkPower=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Generalized Linear Regression.</p>
<p>Fit a Generalized Linear Model specified by giving a symbolic description of the linear
predictor (link function) and a description of the error distribution (family). It supports
“gaussian”, “binomial”, “poisson”, “gamma” and “tweedie” as family. Valid link functions for
each family are listed below. The first link function of each family is the default one.</p>
<ul class="simple">
<li>“gaussian” -&gt; “identity”, “log”, “inverse”</li>
<li>“binomial” -&gt; “logit”, “probit”, “cloglog”</li>
<li>“poisson” -&gt; “log”, “identity”, “sqrt”</li>
<li>“gamma” -&gt; “inverse”, “identity”, “log”</li>
<li>“tweedie” -&gt; power link function specified through “linkPower”. The default link power in the tweedie family is 1 - variancePower.</li>
</ul>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://en.wikipedia.org/wiki/Generalized_linear_model">GLM</a></p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)),],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">glr</span> <span class="o">=</span> <span class="n">GeneralizedLinearRegression</span><span class="p">(</span><span class="n">family</span><span class="o">=</span><span class="s2">&quot;gaussian&quot;</span><span class="p">,</span> <span class="n">link</span><span class="o">=</span><span class="s2">&quot;identity&quot;</span><span class="p">,</span> <span class="n">linkPredictionCol</span><span class="o">=</span><span class="s2">&quot;p&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">glr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">transformed</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">transformed</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span> <span class="o">-</span> <span class="mf">1.5</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">transformed</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">p</span> <span class="o">-</span> <span class="mf">1.5</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">coefficients</span>
<span class="go">DenseVector([1.5..., -1.0...])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">intercept</span> <span class="o">-</span> <span class="mf">1.5</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">glr_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/glr&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">glr</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">glr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">glr2</span> <span class="o">=</span> <span class="n">GeneralizedLinearRegression</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">glr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">glr</span><span class="o">.</span><span class="n">getFamily</span><span class="p">()</span> <span class="o">==</span> <span class="n">glr2</span><span class="o">.</span><span class="n">getFamily</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/glr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">GeneralizedLinearRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">intercept</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">intercept</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.family">
<code class="descname">family</code><em class="property"> = Param(parent='undefined', name='family', doc='The name of family which is a description of the error distribution to be used in the model. Supported options: gaussian (default), binomial, poisson, gamma and tweedie.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.family" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept">
<code class="descname">fitIntercept</code><em class="property"> = Param(parent='undefined', name='fitIntercept', doc='whether to fit an intercept term.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getFamily">
<code class="descname">getFamily</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.getFamily"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getFamily" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of family or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getFitIntercept">
<code class="descname">getFitIntercept</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fitIntercept or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getLink">
<code class="descname">getLink</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.getLink"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getLink" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of link or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getLinkPower">
<code class="descname">getLinkPower</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.getLinkPower"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getLinkPower" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of linkPower or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getLinkPredictionCol">
<code class="descname">getLinkPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.getLinkPredictionCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getLinkPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of linkPredictionCol or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getRegParam">
<code class="descname">getRegParam</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of regParam or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getSolver">
<code class="descname">getSolver</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getSolver" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of solver or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getVariancePower">
<code class="descname">getVariancePower</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.getVariancePower"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getVariancePower" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of variancePower or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by the user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.link">
<code class="descname">link</code><em class="property"> = Param(parent='undefined', name='link', doc='The name of link function which provides the relationship between the linear predictor and the mean of the distribution function. Supported options: identity, log, inverse, logit, probit, cloglog and sqrt.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.link" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.linkPower">
<code class="descname">linkPower</code><em class="property"> = Param(parent='undefined', name='linkPower', doc='The index in the power link function. Only applicable to the Tweedie family.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.linkPower" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.linkPredictionCol">
<code class="descname">linkPredictionCol</code><em class="property"> = Param(parent='undefined', name='linkPredictionCol', doc='link prediction (linear predictor) column name')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.linkPredictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.regParam">
<code class="descname">regParam</code><em class="property"> = Param(parent='undefined', name='regParam', doc='regularization parameter (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.regParam" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setFamily">
<code class="descname">setFamily</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.setFamily"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setFamily" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.family" title="pyspark.ml.regression.GeneralizedLinearRegression.family"><code class="xref py py-attr docutils literal"><span class="pre">family</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.featuresCol" title="pyspark.ml.regression.GeneralizedLinearRegression.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setFitIntercept">
<code class="descname">setFitIntercept</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept" title="pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">fitIntercept</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.labelCol" title="pyspark.ml.regression.GeneralizedLinearRegression.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setLink">
<code class="descname">setLink</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.setLink"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setLink" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.link" title="pyspark.ml.regression.GeneralizedLinearRegression.link"><code class="xref py py-attr docutils literal"><span class="pre">link</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setLinkPower">
<code class="descname">setLinkPower</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.setLinkPower"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setLinkPower" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.linkPower" title="pyspark.ml.regression.GeneralizedLinearRegression.linkPower"><code class="xref py py-attr docutils literal"><span class="pre">linkPower</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setLinkPredictionCol">
<code class="descname">setLinkPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.setLinkPredictionCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setLinkPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.linkPredictionCol" title="pyspark.ml.regression.GeneralizedLinearRegression.linkPredictionCol"><code class="xref py py-attr docutils literal"><span class="pre">linkPredictionCol</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.maxIter" title="pyspark.ml.regression.GeneralizedLinearRegression.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>labelCol=&quot;label&quot;</em>, <em>featuresCol=&quot;features&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>family=&quot;gaussian&quot;</em>, <em>link=None</em>, <em>fitIntercept=True</em>, <em>maxIter=25</em>, <em>tol=1e-6</em>, <em>regParam=0.0</em>, <em>weightCol=None</em>, <em>solver=&quot;irls&quot;</em>, <em>linkPredictionCol=None</em>, <em>variancePower=0.0</em>, <em>linkPower=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for generalized linear regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.predictionCol" title="pyspark.ml.regression.GeneralizedLinearRegression.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setRegParam">
<code class="descname">setRegParam</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.regParam" title="pyspark.ml.regression.GeneralizedLinearRegression.regParam"><code class="xref py py-attr docutils literal"><span class="pre">regParam</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setSolver">
<code class="descname">setSolver</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setSolver" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.solver" title="pyspark.ml.regression.GeneralizedLinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">solver</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.tol" title="pyspark.ml.regression.GeneralizedLinearRegression.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setVariancePower">
<code class="descname">setVariancePower</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegression.setVariancePower"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setVariancePower" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.variancePower" title="pyspark.ml.regression.GeneralizedLinearRegression.variancePower"><code class="xref py py-attr docutils literal"><span class="pre">variancePower</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.weightCol" title="pyspark.ml.regression.GeneralizedLinearRegression.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.solver">
<code class="descname">solver</code><em class="property"> = Param(parent='undefined', name='solver', doc=&quot;the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.&quot;)</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.solver" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.variancePower">
<code class="descname">variancePower</code><em class="property"> = Param(parent='undefined', name='variancePower', doc='The power in the variance function of the Tweedie distribution which characterizes the relationship between the variance and mean of the distribution. Only applicable for the Tweedie family. Supported values: 0 and [1, Inf).')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.variancePower" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegression.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegression.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">GeneralizedLinearRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression" title="pyspark.ml.regression.GeneralizedLinearRegression"><code class="xref py py-class docutils literal"><span class="pre">GeneralizedLinearRegression</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.coefficients">
<code class="descname">coefficients</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.coefficients" title="Permalink to this definition"></a></dt>
<dd><p>Model coefficients.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.evaluate">
<code class="descname">evaluate</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegressionModel.evaluate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.evaluate" title="Permalink to this definition"></a></dt>
<dd><p>Evaluates the model on a test dataset.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>dataset</strong> – Test dataset to evaluate model on, where dataset is an
instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.hasSummary">
<code class="descname">hasSummary</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.hasSummary" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether a training summary exists for this model
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Model intercept.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.summary">
<code class="descname">summary</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.summary" title="Permalink to this definition"></a></dt>
<dd><p>Gets summary (e.g. residuals, deviance, pValues) of model on
training set. An exception is thrown if
<cite>trainingSummary is None</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">GeneralizedLinearRegressionSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegressionSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Generalized linear regression results evaluated on a dataset.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.aic">
<code class="descname">aic</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.aic" title="Permalink to this definition"></a></dt>
<dd><p>Akaike’s “An Information Criterion” (AIC) for the fitted model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.degreesOfFreedom">
<code class="descname">degreesOfFreedom</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.degreesOfFreedom" title="Permalink to this definition"></a></dt>
<dd><p>Degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.deviance">
<code class="descname">deviance</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.deviance" title="Permalink to this definition"></a></dt>
<dd><p>The deviance for the fitted model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.dispersion">
<code class="descname">dispersion</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.dispersion" title="Permalink to this definition"></a></dt>
<dd><p>The dispersion of the fitted model.
It is taken as 1.0 for the “binomial” and “poisson” families, and otherwise
estimated by the residual Pearson’s Chi-Squared statistic (which is defined as
sum of the squares of the Pearson residuals) divided by the residual degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.nullDeviance">
<code class="descname">nullDeviance</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.nullDeviance" title="Permalink to this definition"></a></dt>
<dd><p>The deviance for the null model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.numInstances">
<code class="descname">numInstances</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.numInstances" title="Permalink to this definition"></a></dt>
<dd><p>Number of instances in DataFrame predictions.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.predictionCol">
<code class="descname">predictionCol</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.predictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.predictions" title="pyspark.ml.regression.GeneralizedLinearRegressionSummary.predictions"><code class="xref py py-attr docutils literal"><span class="pre">predictions</span></code></a> which gives the predicted value of each instance.
This is set to a new column name if the original model’s <cite>predictionCol</cite> is not set.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>Predictions output by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.rank">
<code class="descname">rank</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.rank" title="Permalink to this definition"></a></dt>
<dd><p>The numeric rank of the fitted linear model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.residualDegreeOfFreedom">
<code class="descname">residualDegreeOfFreedom</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.residualDegreeOfFreedom" title="Permalink to this definition"></a></dt>
<dd><p>The residual degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.residualDegreeOfFreedomNull">
<code class="descname">residualDegreeOfFreedomNull</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.residualDegreeOfFreedomNull" title="Permalink to this definition"></a></dt>
<dd><p>The residual degrees of freedom for the null model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionSummary.residuals">
<code class="descname">residuals</code><span class="sig-paren">(</span><em>residualsType='deviance'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegressionSummary.residuals"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionSummary.residuals" title="Permalink to this definition"></a></dt>
<dd><p>Get the residuals of the fitted model by type.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>residualsType</strong> – The type of residuals which should be returned.
Supported options: deviance (default), pearson, working, and response.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">GeneralizedLinearRegressionTrainingSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#GeneralizedLinearRegressionTrainingSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Generalized linear regression training results.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.aic">
<code class="descname">aic</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.aic" title="Permalink to this definition"></a></dt>
<dd><p>Akaike’s “An Information Criterion” (AIC) for the fitted model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.coefficientStandardErrors">
<code class="descname">coefficientStandardErrors</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.coefficientStandardErrors" title="Permalink to this definition"></a></dt>
<dd><p>Standard error of estimated coefficients and intercept.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept" title="pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">GeneralizedLinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.degreesOfFreedom">
<code class="descname">degreesOfFreedom</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.degreesOfFreedom" title="Permalink to this definition"></a></dt>
<dd><p>Degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.deviance">
<code class="descname">deviance</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.deviance" title="Permalink to this definition"></a></dt>
<dd><p>The deviance for the fitted model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.dispersion">
<code class="descname">dispersion</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.dispersion" title="Permalink to this definition"></a></dt>
<dd><p>The dispersion of the fitted model.
It is taken as 1.0 for the “binomial” and “poisson” families, and otherwise
estimated by the residual Pearson’s Chi-Squared statistic (which is defined as
sum of the squares of the Pearson residuals) divided by the residual degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.nullDeviance">
<code class="descname">nullDeviance</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.nullDeviance" title="Permalink to this definition"></a></dt>
<dd><p>The deviance for the null model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.numInstances">
<code class="descname">numInstances</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.numInstances" title="Permalink to this definition"></a></dt>
<dd><p>Number of instances in DataFrame predictions.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.numIterations">
<code class="descname">numIterations</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.numIterations" title="Permalink to this definition"></a></dt>
<dd><p>Number of training iterations.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.pValues">
<code class="descname">pValues</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.pValues" title="Permalink to this definition"></a></dt>
<dd><p>Two-sided p-value of estimated coefficients and intercept.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept" title="pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">GeneralizedLinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.predictionCol">
<code class="descname">predictionCol</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.predictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.predictions" title="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.predictions"><code class="xref py py-attr docutils literal"><span class="pre">predictions</span></code></a> which gives the predicted value of each instance.
This is set to a new column name if the original model’s <cite>predictionCol</cite> is not set.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>Predictions output by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.rank">
<code class="descname">rank</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.rank" title="Permalink to this definition"></a></dt>
<dd><p>The numeric rank of the fitted linear model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.residualDegreeOfFreedom">
<code class="descname">residualDegreeOfFreedom</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.residualDegreeOfFreedom" title="Permalink to this definition"></a></dt>
<dd><p>The residual degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.residualDegreeOfFreedomNull">
<code class="descname">residualDegreeOfFreedomNull</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.residualDegreeOfFreedomNull" title="Permalink to this definition"></a></dt>
<dd><p>The residual degrees of freedom for the null model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.residuals">
<code class="descname">residuals</code><span class="sig-paren">(</span><em>residualsType='deviance'</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.residuals" title="Permalink to this definition"></a></dt>
<dd><p>Get the residuals of the fitted model by type.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>residualsType</strong> – The type of residuals which should be returned.
Supported options: deviance (default), pearson, working, and response.</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.solver">
<code class="descname">solver</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.solver" title="Permalink to this definition"></a></dt>
<dd><p>The numeric solver used for training.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.tValues">
<code class="descname">tValues</code><a class="headerlink" href="#pyspark.ml.regression.GeneralizedLinearRegressionTrainingSummary.tValues" title="Permalink to this definition"></a></dt>
<dd><p>T-statistic of estimated coefficients and intercept.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept" title="pyspark.ml.regression.GeneralizedLinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">GeneralizedLinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.IsotonicRegression">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">IsotonicRegression</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>weightCol=None</em>, <em>isotonic=True</em>, <em>featureIndex=0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#IsotonicRegression"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression" title="Permalink to this definition"></a></dt>
<dd><p>Currently implemented using the parallelized pool adjacent violators algorithm.
Only the univariate (single feature) algorithm is supported.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ir</span> <span class="o">=</span> <span class="n">IsotonicRegression</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">ir</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">boundaries</span>
<span class="go">DenseVector([0.0, 1.0])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ir_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/ir&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ir</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">ir_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ir2</span> <span class="o">=</span> <span class="n">IsotonicRegression</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">ir_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ir2</span><span class="o">.</span><span class="n">getIsotonic</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/ir_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">IsotonicRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">boundaries</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">boundaries</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">predictions</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">predictions</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegression.featureIndex">
<code class="descname">featureIndex</code><em class="property"> = Param(parent='undefined', name='featureIndex', doc='The index of the feature if featuresCol is a vector column, no effect otherwise.')</em><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.featureIndex" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegression.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getFeatureIndex">
<code class="descname">getFeatureIndex</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#IsotonicRegression.getFeatureIndex"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getFeatureIndex" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featureIndex or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getIsotonic">
<code class="descname">getIsotonic</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#IsotonicRegression.getIsotonic"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getIsotonic" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of isotonic or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegression.isotonic">
<code class="descname">isotonic</code><em class="property"> = Param(parent='undefined', name='isotonic', doc='whether the output sequence should be isotonic/increasing (true) or antitonic/decreasing (false).')</em><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.isotonic" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegression.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegression.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegression.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.setFeatureIndex">
<code class="descname">setFeatureIndex</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#IsotonicRegression.setFeatureIndex"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.setFeatureIndex" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.IsotonicRegression.featureIndex" title="pyspark.ml.regression.IsotonicRegression.featureIndex"><code class="xref py py-attr docutils literal"><span class="pre">featureIndex</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.IsotonicRegression.featuresCol" title="pyspark.ml.regression.IsotonicRegression.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.setIsotonic">
<code class="descname">setIsotonic</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#IsotonicRegression.setIsotonic"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.setIsotonic" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.IsotonicRegression.isotonic" title="pyspark.ml.regression.IsotonicRegression.isotonic"><code class="xref py py-attr docutils literal"><span class="pre">isotonic</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.IsotonicRegression.labelCol" title="pyspark.ml.regression.IsotonicRegression.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>weightCol=None</em>, <em>isotonic=True</em>, <em>featureIndex=0</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#IsotonicRegression.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, featuresCol=&quot;features&quot;, labelCol=&quot;label&quot;, predictionCol=&quot;prediction&quot;, weightCol=None, isotonic=True, featureIndex=0):
Set the params for IsotonicRegression.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.IsotonicRegression.predictionCol" title="pyspark.ml.regression.IsotonicRegression.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.IsotonicRegression.weightCol" title="pyspark.ml.regression.IsotonicRegression.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegression.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegression.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegression.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.IsotonicRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">IsotonicRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#IsotonicRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.regression.IsotonicRegression" title="pyspark.ml.regression.IsotonicRegression"><code class="xref py py-class docutils literal"><span class="pre">IsotonicRegression</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.boundaries">
<code class="descname">boundaries</code><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.boundaries" title="Permalink to this definition"></a></dt>
<dd><p>Boundaries in increasing order for which predictions are known.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.predictions" title="Permalink to this definition"></a></dt>
<dd><p>Predictions associated with the boundaries at the same index, monotone because of isotonic
regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.6.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.IsotonicRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.IsotonicRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.LinearRegression">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">LinearRegression</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxIter=100</em>, <em>regParam=0.0</em>, <em>elasticNetParam=0.0</em>, <em>tol=1e-06</em>, <em>fitIntercept=True</em>, <em>standardization=True</em>, <em>solver='auto'</em>, <em>weightCol=None</em>, <em>aggregationDepth=2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#LinearRegression"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.LinearRegression" title="Permalink to this definition"></a></dt>
<dd><p>Linear regression.</p>
<p>The learning objective is to minimize the squared error, with regularization.
The specific squared error loss function used is: L = 1/(2n) ||A coefficients - y||<sup>2</sup></p>
<p>This supports multiple types of regularization:</p>
<blockquote>
<div><ul class="simple">
<li>none (a.k.a. ordinary least squares)</li>
<li>L2 (ridge regression)</li>
<li>L1 (Lasso)</li>
<li>L2 + L1 (elastic net)</li>
</ul>
</div></blockquote>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;weight&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span> <span class="o">=</span> <span class="n">LinearRegression</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">regParam</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">solver</span><span class="o">=</span><span class="s2">&quot;normal&quot;</span><span class="p">,</span> <span class="n">weightCol</span><span class="o">=</span><span class="s2">&quot;weight&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span> <span class="o">-</span> <span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">))</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="mf">1.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">intercept</span> <span class="o">-</span> <span class="mf">0.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">abs</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span> <span class="o">-</span> <span class="mf">1.0</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mf">0.001</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="s2">&quot;vector&quot;</span><span class="p">)</span>
<span class="gt">Traceback (most recent call last):</span>
<span class="o">...</span>
<span class="gr">TypeError</span>: <span class="n">Method setParams forces keyword arguments.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/lr&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">lr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr2</span> <span class="o">=</span> <span class="n">LinearRegression</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">lr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr2</span><span class="o">.</span><span class="n">getMaxIter</span><span class="p">()</span>
<span class="go">5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/lr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">LinearRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">coefficients</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">intercept</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">intercept</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">1</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.aggregationDepth">
<code class="descname">aggregationDepth</code><em class="property"> = Param(parent='undefined', name='aggregationDepth', doc='suggested depth for treeAggregate (&gt;= 2).')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.aggregationDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.elasticNetParam">
<code class="descname">elasticNetParam</code><em class="property"> = Param(parent='undefined', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.elasticNetParam" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.fitIntercept">
<code class="descname">fitIntercept</code><em class="property"> = Param(parent='undefined', name='fitIntercept', doc='whether to fit an intercept term.')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getAggregationDepth">
<code class="descname">getAggregationDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of aggregationDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getElasticNetParam">
<code class="descname">getElasticNetParam</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getElasticNetParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of elasticNetParam or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getFitIntercept">
<code class="descname">getFitIntercept</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of fitIntercept or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getMaxIter">
<code class="descname">getMaxIter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxIter or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getRegParam">
<code class="descname">getRegParam</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of regParam or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getSolver">
<code class="descname">getSolver</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getSolver" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of solver or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getStandardization">
<code class="descname">getStandardization</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getStandardization" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of standardization or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getTol">
<code class="descname">getTol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getTol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of tol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.getWeightCol">
<code class="descname">getWeightCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.getWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of weightCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.maxIter">
<code class="descname">maxIter</code><em class="property"> = Param(parent='undefined', name='maxIter', doc='max number of iterations (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.maxIter" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.regParam">
<code class="descname">regParam</code><em class="property"> = Param(parent='undefined', name='regParam', doc='regularization parameter (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.regParam" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setAggregationDepth">
<code class="descname">setAggregationDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setAggregationDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.aggregationDepth" title="pyspark.ml.regression.LinearRegression.aggregationDepth"><code class="xref py py-attr docutils literal"><span class="pre">aggregationDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setElasticNetParam">
<code class="descname">setElasticNetParam</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setElasticNetParam" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.elasticNetParam" title="pyspark.ml.regression.LinearRegression.elasticNetParam"><code class="xref py py-attr docutils literal"><span class="pre">elasticNetParam</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.featuresCol" title="pyspark.ml.regression.LinearRegression.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setFitIntercept">
<code class="descname">setFitIntercept</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setFitIntercept" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="pyspark.ml.regression.LinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">fitIntercept</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.labelCol" title="pyspark.ml.regression.LinearRegression.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setMaxIter">
<code class="descname">setMaxIter</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setMaxIter" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.maxIter" title="pyspark.ml.regression.LinearRegression.maxIter"><code class="xref py py-attr docutils literal"><span class="pre">maxIter</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxIter=100</em>, <em>regParam=0.0</em>, <em>elasticNetParam=0.0</em>, <em>tol=1e-6</em>, <em>fitIntercept=True</em>, <em>standardization=True</em>, <em>solver=&quot;auto&quot;</em>, <em>weightCol=None</em>, <em>aggregationDepth=2</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#LinearRegression.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for linear regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.predictionCol" title="pyspark.ml.regression.LinearRegression.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setRegParam">
<code class="descname">setRegParam</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setRegParam" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.regParam" title="pyspark.ml.regression.LinearRegression.regParam"><code class="xref py py-attr docutils literal"><span class="pre">regParam</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setSolver">
<code class="descname">setSolver</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setSolver" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">solver</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setStandardization">
<code class="descname">setStandardization</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setStandardization" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.standardization" title="pyspark.ml.regression.LinearRegression.standardization"><code class="xref py py-attr docutils literal"><span class="pre">standardization</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setTol">
<code class="descname">setTol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setTol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.tol" title="pyspark.ml.regression.LinearRegression.tol"><code class="xref py py-attr docutils literal"><span class="pre">tol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.setWeightCol">
<code class="descname">setWeightCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.setWeightCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.weightCol" title="pyspark.ml.regression.LinearRegression.weightCol"><code class="xref py py-attr docutils literal"><span class="pre">weightCol</span></code></a>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.solver">
<code class="descname">solver</code><em class="property"> = Param(parent='undefined', name='solver', doc=&quot;the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.&quot;)</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.solver" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.standardization">
<code class="descname">standardization</code><em class="property"> = Param(parent='undefined', name='standardization', doc='whether to standardize the training features before fitting the model.')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.standardization" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.tol">
<code class="descname">tol</code><em class="property"> = Param(parent='undefined', name='tol', doc='the convergence tolerance for iterative algorithms (&gt;= 0).')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.tol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegression.weightCol">
<code class="descname">weightCol</code><em class="property"> = Param(parent='undefined', name='weightCol', doc='weight column name. If this is not set or empty, we treat all instance weights as 1.0.')</em><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.weightCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegression.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegression.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.LinearRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">LinearRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#LinearRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.regression.LinearRegression" title="pyspark.ml.regression.LinearRegression"><code class="xref py py-class docutils literal"><span class="pre">LinearRegression</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionModel.coefficients">
<code class="descname">coefficients</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.coefficients" title="Permalink to this definition"></a></dt>
<dd><p>Model coefficients.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.evaluate">
<code class="descname">evaluate</code><span class="sig-paren">(</span><em>dataset</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#LinearRegressionModel.evaluate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.evaluate" title="Permalink to this definition"></a></dt>
<dd><p>Evaluates the model on a test dataset.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>dataset</strong> – Test dataset to evaluate model on, where dataset is an
instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionModel.hasSummary">
<code class="descname">hasSummary</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.hasSummary" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether a training summary exists for this model
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionModel.intercept">
<code class="descname">intercept</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.intercept" title="Permalink to this definition"></a></dt>
<dd><p>Model intercept.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionModel.summary">
<code class="descname">summary</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.summary" title="Permalink to this definition"></a></dt>
<dd><p>Gets the summary (e.g. residuals, MSE, r-squared) of the model on
the training set. An exception is thrown if
<cite>trainingSummary is None</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.LinearRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.LinearRegressionSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">LinearRegressionSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#LinearRegressionSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Linear regression results evaluated on a dataset.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.coefficientStandardErrors">
<code class="descname">coefficientStandardErrors</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.coefficientStandardErrors" title="Permalink to this definition"></a></dt>
<dd><p>Standard error of estimated coefficients and intercept.
This value is only available when using the “normal” solver.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="pyspark.ml.regression.LinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.degreesOfFreedom">
<code class="descname">degreesOfFreedom</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.degreesOfFreedom" title="Permalink to this definition"></a></dt>
<dd><p>Degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.devianceResiduals">
<code class="descname">devianceResiduals</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.devianceResiduals" title="Permalink to this definition"></a></dt>
<dd><p>The weighted residuals, the usual residuals rescaled by the
square root of the instance weights.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.explainedVariance">
<code class="descname">explainedVariance</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.explainedVariance" title="Permalink to this definition"></a></dt>
<dd><p>Returns the explained variance regression score.
explainedVariance = 1 - variance(y - ŷ) / variance(y)</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="http://en.wikipedia.org/wiki/Explained_variation">Wikipedia explained variation</a></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the features of each instance
as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.labelCol">
<code class="descname">labelCol</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.labelCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the true label of each
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.meanAbsoluteError">
<code class="descname">meanAbsoluteError</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.meanAbsoluteError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the mean absolute error, which is a risk function
corresponding to the expected value of the absolute error
loss or l1-norm loss.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.meanSquaredError">
<code class="descname">meanSquaredError</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.meanSquaredError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the mean squared error, which is a risk function
corresponding to the expected value of the squared error
loss or quadratic loss.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.numInstances">
<code class="descname">numInstances</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.numInstances" title="Permalink to this definition"></a></dt>
<dd><p>Number of instances in DataFrame predictions.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.pValues">
<code class="descname">pValues</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.pValues" title="Permalink to this definition"></a></dt>
<dd><p>Two-sided p-value of estimated coefficients and intercept.
This value is only available when using the “normal” solver.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="pyspark.ml.regression.LinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.predictionCol">
<code class="descname">predictionCol</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.predictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the predicted value of
the label at each instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame output by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.r2">
<code class="descname">r2</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.r2" title="Permalink to this definition"></a></dt>
<dd><p>Returns R<sup>2</sup>, the coefficient of determination.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://en.wikipedia.org/wiki/Coefficient_of_determination">Wikipedia coefficient of determination</a></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.residuals">
<code class="descname">residuals</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.residuals" title="Permalink to this definition"></a></dt>
<dd><p>Residuals (label - predicted value)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.rootMeanSquaredError">
<code class="descname">rootMeanSquaredError</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.rootMeanSquaredError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the root mean squared error, which is defined as the
square root of the mean squared error.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionSummary.tValues">
<code class="descname">tValues</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionSummary.tValues" title="Permalink to this definition"></a></dt>
<dd><p>T-statistic of estimated coefficients and intercept.
This value is only available when using the “normal” solver.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="pyspark.ml.regression.LinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">LinearRegressionTrainingSummary</code><span class="sig-paren">(</span><em>java_obj=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#LinearRegressionTrainingSummary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Linear regression training results. Currently, the training summary ignores the
training weights except for the objective trace.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.coefficientStandardErrors">
<code class="descname">coefficientStandardErrors</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.coefficientStandardErrors" title="Permalink to this definition"></a></dt>
<dd><p>Standard error of estimated coefficients and intercept.
This value is only available when using the “normal” solver.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="pyspark.ml.regression.LinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.degreesOfFreedom">
<code class="descname">degreesOfFreedom</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.degreesOfFreedom" title="Permalink to this definition"></a></dt>
<dd><p>Degrees of freedom.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.devianceResiduals">
<code class="descname">devianceResiduals</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.devianceResiduals" title="Permalink to this definition"></a></dt>
<dd><p>The weighted residuals, the usual residuals rescaled by the
square root of the instance weights.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.explainedVariance">
<code class="descname">explainedVariance</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.explainedVariance" title="Permalink to this definition"></a></dt>
<dd><p>Returns the explained variance regression score.
explainedVariance = <span class="math">\(1 - \mathrm{variance}(y - \hat{y}) / \mathrm{variance}(y)\)</span></p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="http://en.wikipedia.org/wiki/Explained_variation">Wikipedia explained variation</a></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.featuresCol">
<code class="descname">featuresCol</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.featuresCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the features of each instance
as a vector.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.labelCol">
<code class="descname">labelCol</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.labelCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the true label of each
instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.meanAbsoluteError">
<code class="descname">meanAbsoluteError</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.meanAbsoluteError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the mean absolute error, which is a risk function
corresponding to the expected value of the absolute error
loss or l1-norm loss.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.meanSquaredError">
<code class="descname">meanSquaredError</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.meanSquaredError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the mean squared error, which is a risk function
corresponding to the expected value of the squared error
loss or quadratic loss.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.numInstances">
<code class="descname">numInstances</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.numInstances" title="Permalink to this definition"></a></dt>
<dd><p>Number of instances in DataFrame predictions.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.objectiveHistory">
<code class="descname">objectiveHistory</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.objectiveHistory" title="Permalink to this definition"></a></dt>
<dd><p>Objective function (scaled loss + regularization) at each
iteration.
This value is only available when using the “l-bfgs” solver.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.pValues">
<code class="descname">pValues</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.pValues" title="Permalink to this definition"></a></dt>
<dd><p>Two-sided p-value of estimated coefficients and intercept.
This value is only available when using the “normal” solver.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="pyspark.ml.regression.LinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.predictionCol">
<code class="descname">predictionCol</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.predictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Field in “predictions” which gives the predicted value of
the label at each instance.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.predictions">
<code class="descname">predictions</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.predictions" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame output by the model’s <cite>transform</cite> method.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.r2">
<code class="descname">r2</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.r2" title="Permalink to this definition"></a></dt>
<dd><p>Returns R<sup>2</sup>, the coefficient of determination.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference external" href="https://en.wikipedia.org/wiki/Coefficient_of_determination">Wikipedia coefficient of determination</a></p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.residuals">
<code class="descname">residuals</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.residuals" title="Permalink to this definition"></a></dt>
<dd><p>Residuals (label - predicted value)</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.rootMeanSquaredError">
<code class="descname">rootMeanSquaredError</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.rootMeanSquaredError" title="Permalink to this definition"></a></dt>
<dd><p>Returns the root mean squared error, which is defined as the
square root of the mean squared error.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This ignores instance weights (setting all to 1.0) from
<cite>LinearRegression.weightCol</cite>. This will change in later Spark
versions.</p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.tValues">
<code class="descname">tValues</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.tValues" title="Permalink to this definition"></a></dt>
<dd><p>T-statistic of estimated coefficients and intercept.
This value is only available when using the “normal” solver.</p>
<p>If <a class="reference internal" href="#pyspark.ml.regression.LinearRegression.fitIntercept" title="pyspark.ml.regression.LinearRegression.fitIntercept"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.fitIntercept</span></code></a> is set to True,
then the last element returned corresponds to the intercept.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.LinearRegressionTrainingSummary.totalIterations">
<code class="descname">totalIterations</code><a class="headerlink" href="#pyspark.ml.regression.LinearRegressionTrainingSummary.totalIterations" title="Permalink to this definition"></a></dt>
<dd><p>Number of training iterations until termination.
This value is only available when using the “l-bfgs” solver.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.LinearRegression.solver" title="pyspark.ml.regression.LinearRegression.solver"><code class="xref py py-attr docutils literal"><span class="pre">LinearRegression.solver</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.RandomForestRegressor">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">RandomForestRegressor</code><span class="sig-paren">(</span><em>featuresCol='features'</em>, <em>labelCol='label'</em>, <em>predictionCol='prediction'</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>impurity='variance'</em>, <em>subsamplingRate=1.0</em>, <em>seed=None</em>, <em>numTrees=20</em>, <em>featureSubsetStrategy='auto'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#RandomForestRegressor"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor" title="Permalink to this definition"></a></dt>
<dd><p><a class="reference external" href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a>
learning algorithm for regression.
It supports both continuous and categorical features.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">numpy</span> <span class="k">import</span> <span class="n">allclose</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[]))],</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf</span> <span class="o">=</span> <span class="n">RandomForestRegressor</span><span class="p">(</span><span class="n">numTrees</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">maxDepth</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">rf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">SparseVector(1, {0: 1.0})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">allclose</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">treeWeights</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">])</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test0</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="o">-</span><span class="mf">1.0</span><span class="p">),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test0</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">numFeatures</span>
<span class="go">1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">trees</span>
<span class="go">[DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">getNumTrees</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">test1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test1</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span>
<span class="go">0.5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rfr_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/rfr&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">rfr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf2</span> <span class="o">=</span> <span class="n">RandomForestRegressor</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">rfr_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">rf2</span><span class="o">.</span><span class="n">getNumTrees</span><span class="p">()</span>
<span class="go">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/rfr_model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model2</span> <span class="o">=</span> <span class="n">RandomForestRegressionModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">model_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">featureImportances</span> <span class="o">==</span> <span class="n">model2</span><span class="o">.</span><span class="n">featureImportances</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.cacheNodeIds">
<code class="descname">cacheNodeIds</code><em class="property"> = Param(parent='undefined', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.cacheNodeIds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.checkpointInterval">
<code class="descname">checkpointInterval</code><em class="property"> = Param(parent='undefined', name='checkpointInterval', doc='set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.checkpointInterval" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.featureSubsetStrategy">
<code class="descname">featureSubsetStrategy</code><em class="property"> = Param(parent='undefined', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.featureSubsetStrategy" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.featuresCol">
<code class="descname">featuresCol</code><em class="property"> = Param(parent='undefined', name='featuresCol', doc='features column name.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.featuresCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getCacheNodeIds">
<code class="descname">getCacheNodeIds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of cacheNodeIds or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getCheckpointInterval">
<code class="descname">getCheckpointInterval</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of checkpointInterval or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getFeatureSubsetStrategy">
<code class="descname">getFeatureSubsetStrategy</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getFeatureSubsetStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featureSubsetStrategy or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getFeaturesCol">
<code class="descname">getFeaturesCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of featuresCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getImpurity">
<code class="descname">getImpurity</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of impurity or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getMaxBins">
<code class="descname">getMaxBins</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxBins or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getMaxDepth">
<code class="descname">getMaxDepth</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxDepth or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getMaxMemoryInMB">
<code class="descname">getMaxMemoryInMB</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of maxMemoryInMB or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getMinInfoGain">
<code class="descname">getMinInfoGain</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInfoGain or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getMinInstancesPerNode">
<code class="descname">getMinInstancesPerNode</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minInstancesPerNode or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getNumTrees">
<code class="descname">getNumTrees</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numTrees or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.getSubsamplingRate">
<code class="descname">getSubsamplingRate</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.getSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of subsamplingRate or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.impurity">
<code class="descname">impurity</code><em class="property"> = Param(parent='undefined', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: variance')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.impurity" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.maxBins">
<code class="descname">maxBins</code><em class="property"> = Param(parent='undefined', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be &gt;=2 and &gt;= number of categories for any categorical feature.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.maxBins" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.maxDepth">
<code class="descname">maxDepth</code><em class="property"> = Param(parent='undefined', name='maxDepth', doc='Maximum depth of the tree. (&gt;= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.maxDepth" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.maxMemoryInMB">
<code class="descname">maxMemoryInMB</code><em class="property"> = Param(parent='undefined', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.maxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.minInfoGain">
<code class="descname">minInfoGain</code><em class="property"> = Param(parent='undefined', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.minInfoGain" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.minInstancesPerNode">
<code class="descname">minInstancesPerNode</code><em class="property"> = Param(parent='undefined', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be &gt;= 1.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.minInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.numTrees">
<code class="descname">numTrees</code><em class="property"> = Param(parent='undefined', name='numTrees', doc='Number of trees to train (&gt;= 1).')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.numTrees" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setCacheNodeIds">
<code class="descname">setCacheNodeIds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setCacheNodeIds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.cacheNodeIds" title="pyspark.ml.regression.RandomForestRegressor.cacheNodeIds"><code class="xref py py-attr docutils literal"><span class="pre">cacheNodeIds</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setCheckpointInterval">
<code class="descname">setCheckpointInterval</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setCheckpointInterval" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.checkpointInterval" title="pyspark.ml.regression.RandomForestRegressor.checkpointInterval"><code class="xref py py-attr docutils literal"><span class="pre">checkpointInterval</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setFeatureSubsetStrategy">
<code class="descname">setFeatureSubsetStrategy</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setFeatureSubsetStrategy" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.featureSubsetStrategy" title="pyspark.ml.regression.RandomForestRegressor.featureSubsetStrategy"><code class="xref py py-attr docutils literal"><span class="pre">featureSubsetStrategy</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setFeaturesCol">
<code class="descname">setFeaturesCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setFeaturesCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.featuresCol" title="pyspark.ml.regression.RandomForestRegressor.featuresCol"><code class="xref py py-attr docutils literal"><span class="pre">featuresCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setImpurity">
<code class="descname">setImpurity</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setImpurity" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.impurity" title="pyspark.ml.regression.RandomForestRegressor.impurity"><code class="xref py py-attr docutils literal"><span class="pre">impurity</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.labelCol" title="pyspark.ml.regression.RandomForestRegressor.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setMaxBins">
<code class="descname">setMaxBins</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setMaxBins" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.maxBins" title="pyspark.ml.regression.RandomForestRegressor.maxBins"><code class="xref py py-attr docutils literal"><span class="pre">maxBins</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setMaxDepth">
<code class="descname">setMaxDepth</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setMaxDepth" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.maxDepth" title="pyspark.ml.regression.RandomForestRegressor.maxDepth"><code class="xref py py-attr docutils literal"><span class="pre">maxDepth</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setMaxMemoryInMB">
<code class="descname">setMaxMemoryInMB</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setMaxMemoryInMB" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.maxMemoryInMB" title="pyspark.ml.regression.RandomForestRegressor.maxMemoryInMB"><code class="xref py py-attr docutils literal"><span class="pre">maxMemoryInMB</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setMinInfoGain">
<code class="descname">setMinInfoGain</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setMinInfoGain" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.minInfoGain" title="pyspark.ml.regression.RandomForestRegressor.minInfoGain"><code class="xref py py-attr docutils literal"><span class="pre">minInfoGain</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setMinInstancesPerNode">
<code class="descname">setMinInstancesPerNode</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setMinInstancesPerNode" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.minInstancesPerNode" title="pyspark.ml.regression.RandomForestRegressor.minInstancesPerNode"><code class="xref py py-attr docutils literal"><span class="pre">minInstancesPerNode</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setNumTrees">
<code class="descname">setNumTrees</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.numTrees" title="pyspark.ml.regression.RandomForestRegressor.numTrees"><code class="xref py py-attr docutils literal"><span class="pre">numTrees</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>featuresCol=&quot;features&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>maxDepth=5</em>, <em>maxBins=32</em>, <em>minInstancesPerNode=1</em>, <em>minInfoGain=0.0</em>, <em>maxMemoryInMB=256</em>, <em>cacheNodeIds=False</em>, <em>checkpointInterval=10</em>, <em>impurity=&quot;variance&quot;</em>, <em>subsamplingRate=1.0</em>, <em>seed=None</em>, <em>numTrees=20</em>, <em>featureSubsetStrategy=&quot;auto&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#RandomForestRegressor.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for random forest regression.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.predictionCol" title="pyspark.ml.regression.RandomForestRegressor.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.seed" title="pyspark.ml.regression.RandomForestRegressor.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.setSubsamplingRate">
<code class="descname">setSubsamplingRate</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.setSubsamplingRate" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor.subsamplingRate" title="pyspark.ml.regression.RandomForestRegressor.subsamplingRate"><code class="xref py py-attr docutils literal"><span class="pre">subsamplingRate</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.subsamplingRate">
<code class="descname">subsamplingRate</code><em class="property"> = Param(parent='undefined', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].')</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.subsamplingRate" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.supportedFeatureSubsetStrategies">
<code class="descname">supportedFeatureSubsetStrategies</code><em class="property"> = ['auto', 'all', 'onethird', 'sqrt', 'log2']</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.supportedFeatureSubsetStrategies" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressor.supportedImpurities">
<code class="descname">supportedImpurities</code><em class="property"> = ['variance']</em><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.supportedImpurities" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressor.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressor.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.regression.RandomForestRegressionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.regression.</code><code class="descname">RandomForestRegressionModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/regression.html#RandomForestRegressionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel" title="Permalink to this definition"></a></dt>
<dd><p>Model fitted by <a class="reference internal" href="#pyspark.ml.regression.RandomForestRegressor" title="pyspark.ml.regression.RandomForestRegressor"><code class="xref py py-class docutils literal"><span class="pre">RandomForestRegressor</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.featureImportances">
<code class="descname">featureImportances</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.featureImportances" title="Permalink to this definition"></a></dt>
<dd><p>Estimate of the importance of each feature.</p>
<p>Each feature’s importance is the average of its importance across all trees in the ensemble.
The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
(Hastie, Tibshirani, Friedman. “The Elements of Statistical Learning, 2nd Edition.” 2001.)
and follows the implementation from scikit-learn.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<p class="last"><a class="reference internal" href="#pyspark.ml.regression.DecisionTreeRegressionModel.featureImportances" title="pyspark.ml.regression.DecisionTreeRegressionModel.featureImportances"><code class="xref py py-attr docutils literal"><span class="pre">DecisionTreeRegressionModel.featureImportances</span></code></a></p>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.getNumTrees">
<code class="descname">getNumTrees</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.getNumTrees" title="Permalink to this definition"></a></dt>
<dd><p>Number of trees in ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.toDebugString">
<code class="descname">toDebugString</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.toDebugString" title="Permalink to this definition"></a></dt>
<dd><p>Full description of model.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.totalNumNodes">
<code class="descname">totalNumNodes</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.totalNumNodes" title="Permalink to this definition"></a></dt>
<dd><p>Total number of nodes, summed over all trees in the ensemble.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.treeWeights">
<code class="descname">treeWeights</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.treeWeights" title="Permalink to this definition"></a></dt>
<dd><p>Return the weights for each tree.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.trees">
<code class="descname">trees</code><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.trees" title="Permalink to this definition"></a></dt>
<dd><p>Trees in this ensemble. Warning: These have null parent Estimators.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.regression.RandomForestRegressionModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.regression.RandomForestRegressionModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.stat">
<span id="pyspark-ml-stat-module"></span><h2>pyspark.ml.stat module<a class="headerlink" href="#module-pyspark.ml.stat" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.stat.ChiSquareTest">
<em class="property">class </em><code class="descclassname">pyspark.ml.stat.</code><code class="descname">ChiSquareTest</code><a class="reference internal" href="_modules/pyspark/ml/stat.html#ChiSquareTest"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.stat.ChiSquareTest" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Conduct Pearson’s independence test for every feature against the label. For each feature,
the (feature, label) pairs are converted into a contingency matrix for which the Chi-squared
statistic is computed. All label and feature values must be categorical.</p>
<p>The null hypothesis is that the occurrence of the outcomes is statistically independent.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – DataFrame of categorical labels and categorical features.
Real-valued features will be treated as categorical for each distinct value.</li>
<li><strong>featuresCol</strong> – Name of features column in dataset, of type <cite>Vector</cite> (<cite>VectorUDT</cite>).</li>
<li><strong>labelCol</strong> – Name of label column in dataset, of any numerical type.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">DataFrame containing the test result for every feature against the label.
This DataFrame will contain a single Row with the following fields:</p>
<ul class="simple">
<li><cite>pValues: Vector</cite></li>
<li><cite>degreesOfFreedom: Array[Int]</cite></li>
<li><cite>statistics: Vector</cite></li>
</ul>
<p class="last">Each of these fields has one value per feature.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.stat</span> <span class="k">import</span> <span class="n">ChiSquareTest</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="p">[[</span><span class="mi">0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])]]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="s2">&quot;features&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">chiSqResult</span> <span class="o">=</span> <span class="n">ChiSquareTest</span><span class="o">.</span><span class="n">test</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="s1">&#39;features&#39;</span><span class="p">,</span> <span class="s1">&#39;label&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">chiSqResult</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;degreesOfFreedom&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">Row(degreesOfFreedom=[3, 1, 0])</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="staticmethod">
<dt id="pyspark.ml.stat.ChiSquareTest.test">
<em class="property">static </em><code class="descname">test</code><span class="sig-paren">(</span><em>dataset</em>, <em>featuresCol</em>, <em>labelCol</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/stat.html#ChiSquareTest.test"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.stat.ChiSquareTest.test" title="Permalink to this definition"></a></dt>
<dd><p>Perform a Pearson’s independence test using dataset.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.stat.Correlation">
<em class="property">class </em><code class="descclassname">pyspark.ml.stat.</code><code class="descname">Correlation</code><a class="reference internal" href="_modules/pyspark/ml/stat.html#Correlation"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.stat.Correlation" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Compute the correlation matrix for the input dataset of Vectors using the specified method.
Methods currently supported: <cite>pearson</cite> (default), <cite>spearman</cite>.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">For Spearman, a rank correlation, we need to create an RDD[Double] for each column
and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
which is fairly costly. Cache the input Dataset before calling corr with <cite>method = ‘spearman’</cite>
to avoid recomputing the common lineage.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – A dataset or a dataframe.</li>
<li><strong>column</strong> – The name of the column of vectors for which the correlation coefficient needs
to be computed. This must be a column of the dataset, and it must contain
Vector objects.</li>
<li><strong>method</strong> – String specifying the method to use for computing correlation.
Supported: <cite>pearson</cite> (default), <cite>spearman</cite>.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A dataframe that contains the correlation matrix of the column of vectors. This
dataframe contains a single row and a single column of name
‘$METHODNAME($COLUMN)’.</p>
</td>
</tr>
</tbody>
</table>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.stat</span> <span class="k">import</span> <span class="n">Correlation</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="p">[[</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="mi">2</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">6</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">])],</span>
<span class="gp">... </span> <span class="p">[</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mi">9</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])]]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="p">[</span><span class="s1">&#39;features&#39;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pearsonCorr</span> <span class="o">=</span> <span class="n">Correlation</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="s1">&#39;features&#39;</span><span class="p">,</span> <span class="s1">&#39;pearson&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">pearsonCorr</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">&#39;nan&#39;</span><span class="p">,</span> <span class="s1">&#39;NaN&#39;</span><span class="p">))</span>
<span class="go">DenseMatrix([[ 1. , 0.0556..., NaN, 0.4004...],</span>
<span class="go"> [ 0.0556..., 1. , NaN, 0.9135...],</span>
<span class="go"> [ NaN, NaN, 1. , NaN],</span>
<span class="go"> [ 0.4004..., 0.9135..., NaN, 1. ]])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">spearmanCorr</span> <span class="o">=</span> <span class="n">Correlation</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="s1">&#39;features&#39;</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s1">&#39;spearman&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">print</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">spearmanCorr</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">&#39;nan&#39;</span><span class="p">,</span> <span class="s1">&#39;NaN&#39;</span><span class="p">))</span>
<span class="go">DenseMatrix([[ 1. , 0.1054..., NaN, 0.4 ],</span>
<span class="go"> [ 0.1054..., 1. , NaN, 0.9486... ],</span>
<span class="go"> [ NaN, NaN, 1. , NaN],</span>
<span class="go"> [ 0.4 , 0.9486... , NaN, 1. ]])</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="staticmethod">
<dt id="pyspark.ml.stat.Correlation.corr">
<em class="property">static </em><code class="descname">corr</code><span class="sig-paren">(</span><em>dataset</em>, <em>column</em>, <em>method='pearson'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/stat.html#Correlation.corr"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.stat.Correlation.corr" title="Permalink to this definition"></a></dt>
<dd><p>Compute the correlation matrix with specified method using dataset.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.tuning">
<span id="pyspark-ml-tuning-module"></span><h2>pyspark.ml.tuning module<a class="headerlink" href="#module-pyspark.ml.tuning" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.tuning.ParamGridBuilder">
<em class="property">class </em><code class="descclassname">pyspark.ml.tuning.</code><code class="descname">ParamGridBuilder</code><a class="reference internal" href="_modules/pyspark/ml/tuning.html#ParamGridBuilder"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.ParamGridBuilder" title="Permalink to this definition"></a></dt>
<dd><p>Builder for a param grid used in grid search-based model selection.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.classification</span> <span class="k">import</span> <span class="n">LogisticRegression</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="n">ParamGridBuilder</span><span class="p">()</span> \
<span class="gp">... </span> <span class="o">.</span><span class="n">baseOn</span><span class="p">({</span><span class="n">lr</span><span class="o">.</span><span class="n">labelCol</span><span class="p">:</span> <span class="s1">&#39;l&#39;</span><span class="p">})</span> \
<span class="gp">... </span> <span class="o">.</span><span class="n">baseOn</span><span class="p">([</span><span class="n">lr</span><span class="o">.</span><span class="n">predictionCol</span><span class="p">,</span> <span class="s1">&#39;p&#39;</span><span class="p">])</span> \
<span class="gp">... </span> <span class="o">.</span><span class="n">addGrid</span><span class="p">(</span><span class="n">lr</span><span class="o">.</span><span class="n">regParam</span><span class="p">,</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">])</span> \
<span class="gp">... </span> <span class="o">.</span><span class="n">addGrid</span><span class="p">(</span><span class="n">lr</span><span class="o">.</span><span class="n">maxIter</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">5</span><span class="p">])</span> \
<span class="gp">... </span> <span class="o">.</span><span class="n">build</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">expected</span> <span class="o">=</span> <span class="p">[</span>
<span class="gp">... </span> <span class="p">{</span><span class="n">lr</span><span class="o">.</span><span class="n">regParam</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">maxIter</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">labelCol</span><span class="p">:</span> <span class="s1">&#39;l&#39;</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">predictionCol</span><span class="p">:</span> <span class="s1">&#39;p&#39;</span><span class="p">},</span>
<span class="gp">... </span> <span class="p">{</span><span class="n">lr</span><span class="o">.</span><span class="n">regParam</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">maxIter</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">labelCol</span><span class="p">:</span> <span class="s1">&#39;l&#39;</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">predictionCol</span><span class="p">:</span> <span class="s1">&#39;p&#39;</span><span class="p">},</span>
<span class="gp">... </span> <span class="p">{</span><span class="n">lr</span><span class="o">.</span><span class="n">regParam</span><span class="p">:</span> <span class="mf">1.0</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">maxIter</span><span class="p">:</span> <span class="mi">5</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">labelCol</span><span class="p">:</span> <span class="s1">&#39;l&#39;</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">predictionCol</span><span class="p">:</span> <span class="s1">&#39;p&#39;</span><span class="p">},</span>
<span class="gp">... </span> <span class="p">{</span><span class="n">lr</span><span class="o">.</span><span class="n">regParam</span><span class="p">:</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">maxIter</span><span class="p">:</span> <span class="mi">5</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">labelCol</span><span class="p">:</span> <span class="s1">&#39;l&#39;</span><span class="p">,</span> <span class="n">lr</span><span class="o">.</span><span class="n">predictionCol</span><span class="p">:</span> <span class="s1">&#39;p&#39;</span><span class="p">}]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">len</span><span class="p">(</span><span class="n">output</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">expected</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">all</span><span class="p">([</span><span class="n">m</span> <span class="ow">in</span> <span class="n">expected</span> <span class="k">for</span> <span class="n">m</span> <span class="ow">in</span> <span class="n">output</span><span class="p">])</span>
<span class="go">True</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.tuning.ParamGridBuilder.addGrid">
<code class="descname">addGrid</code><span class="sig-paren">(</span><em>param</em>, <em>values</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#ParamGridBuilder.addGrid"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.ParamGridBuilder.addGrid" title="Permalink to this definition"></a></dt>
<dd><p>Adds the given parameter to this grid with a list of candidate values; the built grid contains one combination per value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.ParamGridBuilder.baseOn">
<code class="descname">baseOn</code><span class="sig-paren">(</span><em>*args</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#ParamGridBuilder.baseOn"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.ParamGridBuilder.baseOn" title="Permalink to this definition"></a></dt>
<dd><p>Sets the given parameters in this grid to fixed values.
Accepts either a parameter dictionary or a list of (parameter, value) pairs.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.ParamGridBuilder.build">
<code class="descname">build</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#ParamGridBuilder.build"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.ParamGridBuilder.build" title="Permalink to this definition"></a></dt>
<dd><p>Builds and returns all combinations of parameters specified
by the param grid.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.tuning.CrossValidator">
<em class="property">class </em><code class="descclassname">pyspark.ml.tuning.</code><code class="descname">CrossValidator</code><span class="sig-paren">(</span><em>estimator=None</em>, <em>estimatorParamMaps=None</em>, <em>evaluator=None</em>, <em>numFolds=3</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#CrossValidator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator" title="Permalink to this definition"></a></dt>
<dd><p>K-fold cross validation performs model selection by splitting the dataset into a set of
non-overlapping randomly partitioned folds which are used as separate training and test datasets.
For example, with k=3 folds, K-fold cross validation will generate 3 (training, test) dataset pairs,
each of which uses 2/3 of the data for training and 1/3 for testing. Each fold is used as the
test set exactly once.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.classification</span> <span class="k">import</span> <span class="n">LogisticRegression</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.evaluation</span> <span class="k">import</span> <span class="n">BinaryClassificationEvaluator</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.4</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.5</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.6</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">)]</span> <span class="o">*</span> <span class="mi">10</span><span class="p">,</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="s2">&quot;label&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">grid</span> <span class="o">=</span> <span class="n">ParamGridBuilder</span><span class="p">()</span><span class="o">.</span><span class="n">addGrid</span><span class="p">(</span><span class="n">lr</span><span class="o">.</span><span class="n">maxIter</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span><span class="o">.</span><span class="n">build</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span> <span class="o">=</span> <span class="n">BinaryClassificationEvaluator</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cv</span> <span class="o">=</span> <span class="n">CrossValidator</span><span class="p">(</span><span class="n">estimator</span><span class="o">=</span><span class="n">lr</span><span class="p">,</span> <span class="n">estimatorParamMaps</span><span class="o">=</span><span class="n">grid</span><span class="p">,</span> <span class="n">evaluator</span><span class="o">=</span><span class="n">evaluator</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cvModel</span> <span class="o">=</span> <span class="n">cv</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">cvModel</span><span class="o">.</span><span class="n">avgMetrics</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">0.5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">cvModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">))</span>
<span class="go">0.8333...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#CrossValidator.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with a randomly generated uid
and some extra params. This creates a deep copy of
the embedded paramMap, and copies the embedded and extra parameters over.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidator.estimator">
<code class="descname">estimator</code><em class="property"> = Param(parent='undefined', name='estimator', doc='estimator to be cross-validated')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.estimator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidator.estimatorParamMaps">
<code class="descname">estimatorParamMaps</code><em class="property"> = Param(parent='undefined', name='estimatorParamMaps', doc='estimator param maps')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.estimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidator.evaluator">
<code class="descname">evaluator</code><em class="property"> = Param(parent='undefined', name='evaluator', doc='evaluator used to select hyper-parameters that maximize the validator metric')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.evaluator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.getEstimator">
<code class="descname">getEstimator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.getEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.getEstimatorParamMaps">
<code class="descname">getEstimatorParamMaps</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.getEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimatorParamMaps or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.getEvaluator">
<code class="descname">getEvaluator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.getEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of evaluator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.getNumFolds">
<code class="descname">getNumFolds</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#CrossValidator.getNumFolds"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.getNumFolds" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of numFolds or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidator.numFolds">
<code class="descname">numFolds</code><em class="property"> = Param(parent='undefined', name='numFolds', doc='number of folds for cross validation')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.numFolds" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidator.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidator.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.setEstimator">
<code class="descname">setEstimator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.setEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidator.estimator" title="pyspark.ml.tuning.CrossValidator.estimator"><code class="xref py py-attr docutils literal"><span class="pre">estimator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.setEstimatorParamMaps">
<code class="descname">setEstimatorParamMaps</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.setEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidator.estimatorParamMaps" title="pyspark.ml.tuning.CrossValidator.estimatorParamMaps"><code class="xref py py-attr docutils literal"><span class="pre">estimatorParamMaps</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.setEvaluator">
<code class="descname">setEvaluator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.setEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidator.evaluator" title="pyspark.ml.tuning.CrossValidator.evaluator"><code class="xref py py-attr docutils literal"><span class="pre">evaluator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.setNumFolds">
<code class="descname">setNumFolds</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#CrossValidator.setNumFolds"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.setNumFolds" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidator.numFolds" title="pyspark.ml.tuning.CrossValidator.numFolds"><code class="xref py py-attr docutils literal"><span class="pre">numFolds</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>estimator=None</em>, <em>estimatorParamMaps=None</em>, <em>evaluator=None</em>, <em>numFolds=3</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#CrossValidator.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, seed=None):
Sets params for cross validator.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidator.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidator.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidator.seed" title="pyspark.ml.tuning.CrossValidator.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.tuning.CrossValidatorModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.tuning.</code><code class="descname">CrossValidatorModel</code><span class="sig-paren">(</span><em>bestModel</em>, <em>avgMetrics=[]</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#CrossValidatorModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel" title="Permalink to this definition"></a></dt>
<dd><p>CrossValidatorModel contains the model with the highest average cross-validation
metric across folds and uses this model to transform input data. CrossValidatorModel
also tracks the metrics for each param map evaluated.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidatorModel.avgMetrics">
<code class="descname">avgMetrics</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.avgMetrics" title="Permalink to this definition"></a></dt>
<dd><p>Average cross-validation metrics for each paramMap in
CrossValidator.estimatorParamMaps, in the corresponding order.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidatorModel.bestModel">
<code class="descname">bestModel</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.bestModel" title="Permalink to this definition"></a></dt>
<dd><p>best model from cross validation</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#CrossValidatorModel.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with a randomly generated uid
and some extra params. This copies the underlying bestModel,
creates a deep copy of the embedded paramMap, and
copies the embedded and extra parameters over.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidatorModel.estimator">
<code class="descname">estimator</code><em class="property"> = Param(parent='undefined', name='estimator', doc='estimator to be cross-validated')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.estimator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidatorModel.estimatorParamMaps">
<code class="descname">estimatorParamMaps</code><em class="property"> = Param(parent='undefined', name='estimatorParamMaps', doc='estimator param maps')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.estimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidatorModel.evaluator">
<code class="descname">evaluator</code><em class="property"> = Param(parent='undefined', name='evaluator', doc='evaluator used to select hyper-parameters that maximize the validator metric')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.evaluator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.getEstimator">
<code class="descname">getEstimator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.getEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.getEstimatorParamMaps">
<code class="descname">getEstimatorParamMaps</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.getEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimatorParamMaps or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.getEvaluator">
<code class="descname">getEvaluator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.getEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of evaluator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidatorModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.CrossValidatorModel.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.setEstimator">
<code class="descname">setEstimator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.setEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidatorModel.estimator" title="pyspark.ml.tuning.CrossValidatorModel.estimator"><code class="xref py py-attr docutils literal"><span class="pre">estimator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.setEstimatorParamMaps">
<code class="descname">setEstimatorParamMaps</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.setEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidatorModel.estimatorParamMaps" title="pyspark.ml.tuning.CrossValidatorModel.estimatorParamMaps"><code class="xref py py-attr docutils literal"><span class="pre">estimatorParamMaps</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.setEvaluator">
<code class="descname">setEvaluator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.setEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidatorModel.evaluator" title="pyspark.ml.tuning.CrossValidatorModel.evaluator"><code class="xref py py-attr docutils literal"><span class="pre">evaluator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.CrossValidatorModel.seed" title="pyspark.ml.tuning.CrossValidatorModel.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.CrossValidatorModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.CrossValidatorModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.tuning.TrainValidationSplit">
<em class="property">class </em><code class="descclassname">pyspark.ml.tuning.</code><code class="descname">TrainValidationSplit</code><span class="sig-paren">(</span><em>estimator=None</em>, <em>estimatorParamMaps=None</em>, <em>evaluator=None</em>, <em>trainRatio=0.75</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#TrainValidationSplit"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Validation for hyper-parameter tuning. Randomly splits the input dataset into train and
validation sets, and uses the evaluation metric on the validation set to select the best model.
Similar to <a class="reference internal" href="#pyspark.ml.tuning.CrossValidator" title="pyspark.ml.tuning.CrossValidator"><code class="xref py py-class docutils literal"><span class="pre">CrossValidator</span></code></a>, but only splits the set once.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.classification</span> <span class="k">import</span> <span class="n">LogisticRegression</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.evaluation</span> <span class="k">import</span> <span class="n">BinaryClassificationEvaluator</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.4</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.5</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.6</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">)]</span> <span class="o">*</span> <span class="mi">10</span><span class="p">,</span>
<span class="gp">... </span> <span class="p">[</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="s2">&quot;label&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">lr</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">grid</span> <span class="o">=</span> <span class="n">ParamGridBuilder</span><span class="p">()</span><span class="o">.</span><span class="n">addGrid</span><span class="p">(</span><span class="n">lr</span><span class="o">.</span><span class="n">maxIter</span><span class="p">,</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span><span class="o">.</span><span class="n">build</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span> <span class="o">=</span> <span class="n">BinaryClassificationEvaluator</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tvs</span> <span class="o">=</span> <span class="n">TrainValidationSplit</span><span class="p">(</span><span class="n">estimator</span><span class="o">=</span><span class="n">lr</span><span class="p">,</span> <span class="n">estimatorParamMaps</span><span class="o">=</span><span class="n">grid</span><span class="p">,</span> <span class="n">evaluator</span><span class="o">=</span><span class="n">evaluator</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tvsModel</span> <span class="o">=</span> <span class="n">tvs</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">tvsModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">))</span>
<span class="go">0.8333...</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#TrainValidationSplit.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with a randomly generated uid
and some extra params. This creates a deep copy of
the embedded paramMap, and copies the embedded and extra parameters over.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplit.estimator">
<code class="descname">estimator</code><em class="property"> = Param(parent='undefined', name='estimator', doc='estimator to be cross-validated')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.estimator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplit.estimatorParamMaps">
<code class="descname">estimatorParamMaps</code><em class="property"> = Param(parent='undefined', name='estimatorParamMaps', doc='estimator param maps')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.estimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplit.evaluator">
<code class="descname">evaluator</code><em class="property"> = Param(parent='undefined', name='evaluator', doc='evaluator used to select hyper-parameters that maximize the validator metric')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.evaluator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.getEstimator">
<code class="descname">getEstimator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.getEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.getEstimatorParamMaps">
<code class="descname">getEstimatorParamMaps</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.getEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimatorParamMaps or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.getEvaluator">
<code class="descname">getEvaluator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.getEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of evaluator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.getTrainRatio">
<code class="descname">getTrainRatio</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#TrainValidationSplit.getTrainRatio"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.getTrainRatio" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of trainRatio or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplit.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplit.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.setEstimator">
<code class="descname">setEstimator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.setEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplit.estimator" title="pyspark.ml.tuning.TrainValidationSplit.estimator"><code class="xref py py-attr docutils literal"><span class="pre">estimator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.setEstimatorParamMaps">
<code class="descname">setEstimatorParamMaps</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.setEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplit.estimatorParamMaps" title="pyspark.ml.tuning.TrainValidationSplit.estimatorParamMaps"><code class="xref py py-attr docutils literal"><span class="pre">estimatorParamMaps</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.setEvaluator">
<code class="descname">setEvaluator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.setEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplit.evaluator" title="pyspark.ml.tuning.TrainValidationSplit.evaluator"><code class="xref py py-attr docutils literal"><span class="pre">evaluator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>estimator=None</em>, <em>estimatorParamMaps=None</em>, <em>evaluator=None</em>, <em>trainRatio=0.75</em>, <em>seed=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#TrainValidationSplit.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.setParams" title="Permalink to this definition"></a></dt>
<dd><p>setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, seed=None):
Sets params for the train validation split.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplit.seed" title="pyspark.ml.tuning.TrainValidationSplit.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplit.setTrainRatio">
<code class="descname">setTrainRatio</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#TrainValidationSplit.setTrainRatio"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.setTrainRatio" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplit.trainRatio" title="pyspark.ml.tuning.TrainValidationSplit.trainRatio"><code class="xref py py-attr docutils literal"><span class="pre">trainRatio</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplit.trainRatio">
<code class="descname">trainRatio</code><em class="property"> = Param(parent='undefined', name='trainRatio', doc='Param for ratio between train and validation data. Must be between 0 and 1.')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplit.trainRatio" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.tuning.</code><code class="descname">TrainValidationSplitModel</code><span class="sig-paren">(</span><em>bestModel</em>, <em>validationMetrics=[]</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#TrainValidationSplitModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model from train validation split.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.bestModel">
<code class="descname">bestModel</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.bestModel" title="Permalink to this definition"></a></dt>
<dd><p>best model from train validation split</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/tuning.html#TrainValidationSplitModel.copy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with a randomly generated uid
and some extra params. This copies the underlying bestModel,
creates a deep copy of the embedded paramMap, and
copies the embedded and extra parameters over.
And, this creates a shallow copy of the validationMetrics.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.estimator">
<code class="descname">estimator</code><em class="property"> = Param(parent='undefined', name='estimator', doc='estimator to be cross-validated')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.estimator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.estimatorParamMaps">
<code class="descname">estimatorParamMaps</code><em class="property"> = Param(parent='undefined', name='estimatorParamMaps', doc='estimator param maps')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.estimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.evaluator">
<code class="descname">evaluator</code><em class="property"> = Param(parent='undefined', name='evaluator', doc='evaluator used to select hyper-parameters that maximize the validator metric')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.evaluator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.getEstimator">
<code class="descname">getEstimator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.getEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.getEstimatorParamMaps">
<code class="descname">getEstimatorParamMaps</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.getEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of estimatorParamMaps or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.getEvaluator">
<code class="descname">getEvaluator</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.getEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of evaluator or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.getSeed">
<code class="descname">getSeed</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.getSeed" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of seed or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.seed">
<code class="descname">seed</code><em class="property"> = Param(parent='undefined', name='seed', doc='random seed.')</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.seed" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.setEstimator">
<code class="descname">setEstimator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.setEstimator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplitModel.estimator" title="pyspark.ml.tuning.TrainValidationSplitModel.estimator"><code class="xref py py-attr docutils literal"><span class="pre">estimator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.setEstimatorParamMaps">
<code class="descname">setEstimatorParamMaps</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.setEstimatorParamMaps" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplitModel.estimatorParamMaps" title="pyspark.ml.tuning.TrainValidationSplitModel.estimatorParamMaps"><code class="xref py py-attr docutils literal"><span class="pre">estimatorParamMaps</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.setEvaluator">
<code class="descname">setEvaluator</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.setEvaluator" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplitModel.evaluator" title="pyspark.ml.tuning.TrainValidationSplitModel.evaluator"><code class="xref py py-attr docutils literal"><span class="pre">evaluator</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.setSeed">
<code class="descname">setSeed</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.setSeed" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.tuning.TrainValidationSplitModel.seed" title="pyspark.ml.tuning.TrainValidationSplitModel.seed"><code class="xref py py-attr docutils literal"><span class="pre">seed</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.tuning.TrainValidationSplitModel.validationMetrics">
<code class="descname">validationMetrics</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.tuning.TrainValidationSplitModel.validationMetrics" title="Permalink to this definition"></a></dt>
<dd><p>evaluated validation metrics</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.evaluation">
<span id="pyspark-ml-evaluation-module"></span><h2>pyspark.ml.evaluation module<a class="headerlink" href="#module-pyspark.ml.evaluation" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.evaluation.Evaluator">
<em class="property">class </em><code class="descclassname">pyspark.ml.evaluation.</code><code class="descname">Evaluator</code><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#Evaluator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator" title="Permalink to this definition"></a></dt>
<dd><p>Base class for evaluators that compute metrics from predictions.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. The default implementation creates a
shallow copy using <code class="xref py py-func docutils literal"><span class="pre">copy.copy()</span></code>, and then copies the
embedded and extra parameters over and returns the copy.
Subclasses should override this method if the default approach
is not sufficient.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.evaluate">
<code class="descname">evaluate</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#Evaluator.evaluate"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.evaluate" title="Permalink to this definition"></a></dt>
<dd><p>Evaluates the output with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – a dataset that contains labels/observations and
predictions</li>
<li><strong>params</strong> – an optional param map that overrides embedded
params</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">metric</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.isLargerBetter">
<code class="descname">isLargerBetter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#Evaluator.isLargerBetter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.isLargerBetter" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether the metric returned by <a class="reference internal" href="#pyspark.ml.evaluation.Evaluator.evaluate" title="pyspark.ml.evaluation.Evaluator.evaluate"><code class="xref py py-meth docutils literal"><span class="pre">evaluate()</span></code></a> should be maximized
(True, default) or minimized (False).
A given evaluator may support multiple metrics which may be maximized or minimized.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.Evaluator.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.Evaluator.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.evaluation.Evaluator.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator">
<em class="property">class </em><code class="descclassname">pyspark.ml.evaluation.</code><code class="descname">BinaryClassificationEvaluator</code><span class="sig-paren">(</span><em>rawPredictionCol='rawPrediction'</em>, <em>labelCol='label'</em>, <em>metricName='areaUnderROC'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#BinaryClassificationEvaluator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Evaluator for binary classification, which expects two input columns: rawPrediction and label.
The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="k">import</span> <span class="n">Vectors</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span> <span class="o">-</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">]]),</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">]),</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.4</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.8</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">scoreAndLabels</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;raw&quot;</span><span class="p">,</span> <span class="s2">&quot;label&quot;</span><span class="p">])</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span> <span class="o">=</span> <span class="n">BinaryClassificationEvaluator</span><span class="p">(</span><span class="n">rawPredictionCol</span><span class="o">=</span><span class="s2">&quot;raw&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="go">0.70...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="p">{</span><span class="n">evaluator</span><span class="o">.</span><span class="n">metricName</span><span class="p">:</span> <span class="s2">&quot;areaUnderPR&quot;</span><span class="p">})</span>
<span class="go">0.83...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">bce_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/bce&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">bce_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator2</span> <span class="o">=</span> <span class="n">BinaryClassificationEvaluator</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">bce_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">str</span><span class="p">(</span><span class="n">evaluator2</span><span class="o">.</span><span class="n">getRawPredictionCol</span><span class="p">())</span>
<span class="go">&#39;raw&#39;</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.evaluate">
<code class="descname">evaluate</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.evaluate" title="Permalink to this definition"></a></dt>
<dd><p>Evaluates the output with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – a dataset that contains labels/observations and
predictions</li>
<li><strong>params</strong> – an optional param map that overrides embedded
params</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">metric</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.getMetricName">
<code class="descname">getMetricName</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#BinaryClassificationEvaluator.getMetricName"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.getMetricName" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of metricName or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.getRawPredictionCol">
<code class="descname">getRawPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.getRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of rawPredictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.isLargerBetter">
<code class="descname">isLargerBetter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.isLargerBetter" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether the metric returned by <a class="reference internal" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.evaluate" title="pyspark.ml.evaluation.BinaryClassificationEvaluator.evaluate"><code class="xref py py-meth docutils literal"><span class="pre">evaluate()</span></code></a> should be maximized
(True, default) or minimized (False).
A given evaluator may support multiple metrics which may be maximized or minimized.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.metricName">
<code class="descname">metricName</code><em class="property"> = Param(parent='undefined', name='metricName', doc='metric name in evaluation (areaUnderROC|areaUnderPR)')</em><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.metricName" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.rawPredictionCol">
<code class="descname">rawPredictionCol</code><em class="property"> = Param(parent='undefined', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.')</em><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.rawPredictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.labelCol" title="pyspark.ml.evaluation.BinaryClassificationEvaluator.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.setMetricName">
<code class="descname">setMetricName</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#BinaryClassificationEvaluator.setMetricName"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.setMetricName" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.metricName" title="pyspark.ml.evaluation.BinaryClassificationEvaluator.metricName"><code class="xref py py-attr docutils literal"><span class="pre">metricName</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>rawPredictionCol=&quot;rawPrediction&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>metricName=&quot;areaUnderROC&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#BinaryClassificationEvaluator.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for binary classification evaluator.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.setRawPredictionCol">
<code class="descname">setRawPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.setRawPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.rawPredictionCol" title="pyspark.ml.evaluation.BinaryClassificationEvaluator.rawPredictionCol"><code class="xref py py-attr docutils literal"><span class="pre">rawPredictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.BinaryClassificationEvaluator.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.BinaryClassificationEvaluator.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.evaluation.RegressionEvaluator">
<em class="property">class </em><code class="descclassname">pyspark.ml.evaluation.</code><code class="descname">RegressionEvaluator</code><span class="sig-paren">(</span><em>predictionCol='prediction'</em>, <em>labelCol='label'</em>, <em>metricName='rmse'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#RegressionEvaluator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Evaluator for Regression, which expects two input
columns: prediction and label.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="p">[(</span><span class="o">-</span><span class="mf">28.98343821</span><span class="p">,</span> <span class="o">-</span><span class="mf">27.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">20.21491975</span><span class="p">,</span> <span class="mf">21.5</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="o">-</span><span class="mf">25.98418959</span><span class="p">,</span> <span class="o">-</span><span class="mf">22.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">30.69731842</span><span class="p">,</span> <span class="mf">33.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">74.69283752</span><span class="p">,</span> <span class="mf">71.0</span><span class="p">)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">scoreAndLabels</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;raw&quot;</span><span class="p">,</span> <span class="s2">&quot;label&quot;</span><span class="p">])</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span> <span class="o">=</span> <span class="n">RegressionEvaluator</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="s2">&quot;raw&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="go">2.842...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="p">{</span><span class="n">evaluator</span><span class="o">.</span><span class="n">metricName</span><span class="p">:</span> <span class="s2">&quot;r2&quot;</span><span class="p">})</span>
<span class="go">0.993...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="p">{</span><span class="n">evaluator</span><span class="o">.</span><span class="n">metricName</span><span class="p">:</span> <span class="s2">&quot;mae&quot;</span><span class="p">})</span>
<span class="go">2.649...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">re_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/re&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">re_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator2</span> <span class="o">=</span> <span class="n">RegressionEvaluator</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">re_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">str</span><span class="p">(</span><span class="n">evaluator2</span><span class="o">.</span><span class="n">getPredictionCol</span><span class="p">())</span>
<span class="go">&#39;raw&#39;</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.evaluate">
<code class="descname">evaluate</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.evaluate" title="Permalink to this definition"></a></dt>
<dd><p>Evaluates the output with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – a dataset that contains labels/observations and
predictions</li>
<li><strong>params</strong> – an optional param map that overrides embedded
params</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">metric</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.getMetricName">
<code class="descname">getMetricName</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#RegressionEvaluator.getMetricName"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.getMetricName" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of metricName or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.isLargerBetter">
<code class="descname">isLargerBetter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.isLargerBetter" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether the metric returned by <a class="reference internal" href="#pyspark.ml.evaluation.RegressionEvaluator.evaluate" title="pyspark.ml.evaluation.RegressionEvaluator.evaluate"><code class="xref py py-meth docutils literal"><span class="pre">evaluate()</span></code></a> should be maximized
(True, default) or minimized (False).
A given evaluator may support multiple metrics which may be maximized or minimized.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut for <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.metricName">
<code class="descname">metricName</code><em class="property"> = Param(parent='undefined', name='metricName', doc='metric name in evaluation - one of:\n rmse - root mean squared error (default)\n mse - mean squared error\n r2 - r^2 metric\n mae - mean absolute error.')</em><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.metricName" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.save" title="Permalink to this definition"></a></dt>
<dd><p>Saves this ML instance to the given path, a shortcut for <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.RegressionEvaluator.labelCol" title="pyspark.ml.evaluation.RegressionEvaluator.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.setMetricName">
<code class="descname">setMetricName</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#RegressionEvaluator.setMetricName"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.setMetricName" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.RegressionEvaluator.metricName" title="pyspark.ml.evaluation.RegressionEvaluator.metricName"><code class="xref py py-attr docutils literal"><span class="pre">metricName</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>metricName=&quot;rmse&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#RegressionEvaluator.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for regression evaluator.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.RegressionEvaluator.predictionCol" title="pyspark.ml.evaluation.RegressionEvaluator.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.RegressionEvaluator.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.RegressionEvaluator.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator">
<em class="property">class </em><code class="descclassname">pyspark.ml.evaluation.</code><code class="descname">MulticlassClassificationEvaluator</code><span class="sig-paren">(</span><em>predictionCol='prediction'</em>, <em>labelCol='label'</em>, <em>metricName='f1'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#MulticlassClassificationEvaluator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Evaluator for Multiclass Classification, which expects two input
columns: prediction and label.</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="p">[(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span>
<span class="gp">... </span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">scoreAndLabels</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;prediction&quot;</span><span class="p">,</span> <span class="s2">&quot;label&quot;</span><span class="p">])</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span> <span class="o">=</span> <span class="n">MulticlassClassificationEvaluator</span><span class="p">(</span><span class="n">predictionCol</span><span class="o">=</span><span class="s2">&quot;prediction&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="go">0.66...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">evaluate</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="p">{</span><span class="n">evaluator</span><span class="o">.</span><span class="n">metricName</span><span class="p">:</span> <span class="s2">&quot;accuracy&quot;</span><span class="p">})</span>
<span class="go">0.66...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mce_path</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/mce&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">mce_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">evaluator2</span> <span class="o">=</span> <span class="n">MulticlassClassificationEvaluator</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">mce_path</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">str</span><span class="p">(</span><span class="n">evaluator2</span><span class="o">.</span><span class="n">getPredictionCol</span><span class="p">())</span>
<span class="go">&#39;prediction&#39;</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.evaluate">
<code class="descname">evaluate</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.evaluate" title="Permalink to this definition"></a></dt>
<dd><p>Evaluates the output with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – a dataset that contains labels/observations and
predictions</li>
<li><strong>params</strong> – an optional param map that overrides embedded
params</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">metric</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.getLabelCol">
<code class="descname">getLabelCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.getLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of labelCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.getMetricName">
<code class="descname">getMetricName</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#MulticlassClassificationEvaluator.getMetricName"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.getMetricName" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of metricName or its default value.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.isLargerBetter">
<code class="descname">isLargerBetter</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.isLargerBetter" title="Permalink to this definition"></a></dt>
<dd><p>Indicates whether the metric returned by <a class="reference internal" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.evaluate" title="pyspark.ml.evaluation.MulticlassClassificationEvaluator.evaluate"><code class="xref py py-meth docutils literal"><span class="pre">evaluate()</span></code></a> should be maximized
(True, default) or minimized (False).
A given evaluator may support multiple metrics which may be maximized or minimized.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.labelCol">
<code class="descname">labelCol</code><em class="property"> = Param(parent='undefined', name='labelCol', doc='label column name.')</em><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.labelCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.metricName">
<code class="descname">metricName</code><em class="property"> = Param(parent='undefined', name='metricName', doc='metric name in evaluation (f1|weightedPrecision|weightedRecall|accuracy)')</em><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.metricName" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.setLabelCol">
<code class="descname">setLabelCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.setLabelCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.labelCol" title="pyspark.ml.evaluation.MulticlassClassificationEvaluator.labelCol"><code class="xref py py-attr docutils literal"><span class="pre">labelCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.setMetricName">
<code class="descname">setMetricName</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#MulticlassClassificationEvaluator.setMetricName"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.setMetricName" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.metricName" title="pyspark.ml.evaluation.MulticlassClassificationEvaluator.metricName"><code class="xref py py-attr docutils literal"><span class="pre">metricName</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>labelCol=&quot;label&quot;</em>, <em>metricName=&quot;f1&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/evaluation.html#MulticlassClassificationEvaluator.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for multiclass classification evaluator.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.predictionCol" title="pyspark.ml.evaluation.MulticlassClassificationEvaluator.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.evaluation.MulticlassClassificationEvaluator.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.evaluation.MulticlassClassificationEvaluator.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.fpm">
<span id="pyspark-ml-fpm-module"></span><h2>pyspark.ml.fpm module<a class="headerlink" href="#module-pyspark.ml.fpm" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.fpm.FPGrowth">
<em class="property">class </em><code class="descclassname">pyspark.ml.fpm.</code><code class="descname">FPGrowth</code><span class="sig-paren">(</span><em>minSupport=0.3</em>, <em>minConfidence=0.8</em>, <em>itemsCol='items'</em>, <em>predictionCol='prediction'</em>, <em>numPartitions=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/fpm.html#FPGrowth"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
Li et al., PFP: Parallel FP-Growth for Query Recommendation <a class="reference internal" href="#li2008" id="id6">[LI2008]</a>.
PFP distributes computation in such a way that each worker executes an
independent group of mining tasks. The FP-Growth algorithm is described in
Han et al., Mining frequent patterns without candidate generation <a class="reference internal" href="#han2000" id="id7">[HAN2000]</a>.</p>
<table class="docutils citation" frame="void" id="li2008" rules="none">
<colgroup><col class="label" /><col /></colgroup>
<tbody valign="top">
<tr><td class="label"><a class="fn-backref" href="#id6">[LI2008]</a></td><td><a class="reference external" href="http://dx.doi.org/10.1145/1454008.1454027">http://dx.doi.org/10.1145/1454008.1454027</a></td></tr>
</tbody>
</table>
<table class="docutils citation" frame="void" id="han2000" rules="none">
<colgroup><col class="label" /><col /></colgroup>
<tbody valign="top">
<tr><td class="label"><a class="fn-backref" href="#id7">[HAN2000]</a></td><td><a class="reference external" href="http://dx.doi.org/10.1145/335191.335372">http://dx.doi.org/10.1145/335191.335372</a></td></tr>
</tbody>
</table>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">null values in the feature column are ignored during fit().</p>
</div>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Internally <cite>transform</cite> <cite>collects</cite> and <cite>broadcasts</cite> association rules.</p>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="k">import</span> <span class="n">split</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="p">(</span><span class="n">spark</span><span class="o">.</span><span class="n">read</span>
<span class="gp">... </span> <span class="o">.</span><span class="n">text</span><span class="p">(</span><span class="s2">&quot;data/mllib/sample_fpgrowth.txt&quot;</span><span class="p">)</span>
<span class="gp">... </span> <span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot;value&quot;</span><span class="p">,</span> <span class="s2">&quot;\s+&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;items&quot;</span><span class="p">)))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="go">+------------------------+</span>
<span class="go">|items |</span>
<span class="go">+------------------------+</span>
<span class="go">|[r, z, h, k, p] |</span>
<span class="go">|[z, y, x, w, v, u, t, s]|</span>
<span class="go">|[s, x, o, n, r] |</span>
<span class="go">|[x, z, y, m, t, s, q, e]|</span>
<span class="go">|[z] |</span>
<span class="go">|[x, z, y, r, q, t, p] |</span>
<span class="go">+------------------------+</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">fp</span> <span class="o">=</span> <span class="n">FPGrowth</span><span class="p">(</span><span class="n">minSupport</span><span class="o">=</span><span class="mf">0.2</span><span class="p">,</span> <span class="n">minConfidence</span><span class="o">=</span><span class="mf">0.7</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">fpm</span> <span class="o">=</span> <span class="n">fp</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">fpm</span><span class="o">.</span><span class="n">freqItemsets</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
<span class="go">+---------+----+</span>
<span class="go">| items|freq|</span>
<span class="go">+---------+----+</span>
<span class="go">| [s]| 3|</span>
<span class="go">| [s, x]| 3|</span>
<span class="go">|[s, x, z]| 2|</span>
<span class="go">| [s, z]| 2|</span>
<span class="go">| [r]| 3|</span>
<span class="go">+---------+----+</span>
<span class="go">only showing top 5 rows</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">fpm</span><span class="o">.</span><span class="n">associationRules</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
<span class="go">+----------+----------+----------+</span>
<span class="go">|antecedent|consequent|confidence|</span>
<span class="go">+----------+----------+----------+</span>
<span class="go">| [t, s]| [y]| 1.0|</span>
<span class="go">| [t, s]| [x]| 1.0|</span>
<span class="go">| [t, s]| [z]| 1.0|</span>
<span class="go">| [p]| [r]| 1.0|</span>
<span class="go">| [p]| [z]| 1.0|</span>
<span class="go">+----------+----------+----------+</span>
<span class="go">only showing top 5 rows</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">new_data</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([([</span><span class="s2">&quot;t&quot;</span><span class="p">,</span> <span class="s2">&quot;s&quot;</span><span class="p">],</span> <span class="p">)],</span> <span class="p">[</span><span class="s2">&quot;items&quot;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="n">fpm</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">new_data</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span><span class="o">.</span><span class="n">prediction</span><span class="p">)</span>
<span class="go">[&#39;x&#39;, &#39;y&#39;, &#39;z&#39;]</span>
</pre></div>
</div>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.fit">
<code class="descname">fit</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">fitted model(s)</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.getItemsCol">
<code class="descname">getItemsCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.getItemsCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of itemsCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.getMinConfidence">
<code class="descname">getMinConfidence</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.getMinConfidence" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minConfidence or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.getMinSupport">
<code class="descname">getMinSupport</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.getMinSupport" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of minSupport or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.getNumPartitions">
<code class="descname">getNumPartitions</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.getNumPartitions" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.fpm.FPGrowth.numPartitions" title="pyspark.ml.fpm.FPGrowth.numPartitions"><code class="xref py py-attr docutils literal"><span class="pre">numPartitions</span></code></a> or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.getPredictionCol">
<code class="descname">getPredictionCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.getPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of predictionCol or its default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowth.itemsCol">
<code class="descname">itemsCol</code><em class="property"> = Param(parent='undefined', name='itemsCol', doc='items column name')</em><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.itemsCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowth.minConfidence">
<code class="descname">minConfidence</code><em class="property"> = Param(parent='undefined', name='minConfidence', doc='Minimal confidence for generating Association Rule. [0.0, 1.0]. minConfidence will not affect the mining for frequent itemsets, but will affect the association rules generation.')</em><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.minConfidence" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowth.minSupport">
<code class="descname">minSupport</code><em class="property"> = Param(parent='undefined', name='minSupport', doc='Minimal support level of the frequent pattern. [0.0, 1.0]. Any pattern that appears more than (minSupport * size-of-the-dataset) times will be output in the frequent itemsets.')</em><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.minSupport" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowth.numPartitions">
<code class="descname">numPartitions</code><em class="property"> = Param(parent='undefined', name='numPartitions', doc='Number of partitions (at least 1) used by parallel FP-growth. By default the param is not set, and partition number of the input dataset is used.')</em><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.numPartitions" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowth.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowth.predictionCol">
<code class="descname">predictionCol</code><em class="property"> = Param(parent='undefined', name='predictionCol', doc='prediction column name.')</em><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.predictionCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.setItemsCol">
<code class="descname">setItemsCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.setItemsCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.fpm.FPGrowth.itemsCol" title="pyspark.ml.fpm.FPGrowth.itemsCol"><code class="xref py py-attr docutils literal"><span class="pre">itemsCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.setMinConfidence">
<code class="descname">setMinConfidence</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.setMinConfidence" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.fpm.FPGrowth.minConfidence" title="pyspark.ml.fpm.FPGrowth.minConfidence"><code class="xref py py-attr docutils literal"><span class="pre">minConfidence</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.setMinSupport">
<code class="descname">setMinSupport</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.setMinSupport" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.fpm.FPGrowth.minSupport" title="pyspark.ml.fpm.FPGrowth.minSupport"><code class="xref py py-attr docutils literal"><span class="pre">minSupport</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.setNumPartitions">
<code class="descname">setNumPartitions</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.setNumPartitions" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.fpm.FPGrowth.numPartitions" title="pyspark.ml.fpm.FPGrowth.numPartitions"><code class="xref py py-attr docutils literal"><span class="pre">numPartitions</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.setParams">
<code class="descname">setParams</code><span class="sig-paren">(</span><em>self</em>, <em>minSupport=0.3</em>, <em>minConfidence=0.8</em>, <em>itemsCol=&quot;items&quot;</em>, <em>predictionCol=&quot;prediction&quot;</em>, <em>numPartitions=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/fpm.html#FPGrowth.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.setParams" title="Permalink to this definition"></a></dt>
<dd><div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.setPredictionCol">
<code class="descname">setPredictionCol</code><span class="sig-paren">(</span><em>value</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.setPredictionCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.fpm.FPGrowth.predictionCol" title="pyspark.ml.fpm.FPGrowth.predictionCol"><code class="xref py py-attr docutils literal"><span class="pre">predictionCol</span></code></a>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowth.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowth.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.fpm.FPGrowthModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.fpm.</code><code class="descname">FPGrowthModel</code><span class="sig-paren">(</span><em>java_model=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/fpm.html#FPGrowthModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel" title="Permalink to this definition"></a></dt>
<dd><div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Experimental</p>
</div>
<p>Model fitted by FPGrowth.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowthModel.associationRules">
<code class="descname">associationRules</code><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.associationRules" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame with three columns:</p>
<ul class="simple">
<li><cite>antecedent</cite> - Array of the same type as the input column.</li>
<li><cite>consequent</cite> - Array of the same type as the input column.</li>
<li><cite>confidence</cite> - Confidence for the rule (<cite>DoubleType</cite>).</li>
</ul>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.copy">
<code class="descname">copy</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – Extra parameters to copy to the new instance</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">Copy of this instance</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.explainParam">
<code class="descname">explainParam</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.explainParams">
<code class="descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.extractParamMap">
<code class="descname">extractParamMap</code><span class="sig-paren">(</span><em>extra=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>extra</strong> – extra param values</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">merged param map</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowthModel.freqItemsets">
<code class="descname">freqItemsets</code><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.freqItemsets" title="Permalink to this definition"></a></dt>
<dd><p>DataFrame with two columns:</p>
<ul class="simple">
<li><cite>items</cite> - Itemset of the same type as the input column.</li>
<li><cite>freq</cite> - Frequency of the itemset (<cite>LongType</cite>).</li>
</ul>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.2.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.getOrDefault">
<code class="descname">getOrDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.getParam">
<code class="descname">getParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.hasDefault">
<code class="descname">hasDefault</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.hasParam">
<code class="descname">hasParam</code><span class="sig-paren">(</span><em>paramName</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.isDefined">
<code class="descname">isDefined</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.isSet">
<code class="descname">isSet</code><span class="sig-paren">(</span><em>param</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="pyspark.ml.fpm.FPGrowthModel.params">
<code class="descname">params</code><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.read">
<code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.transform">
<code class="descname">transform</code><span class="sig-paren">(</span><em>dataset</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.transform" title="Permalink to this definition"></a></dt>
<dd><p>Transforms the input dataset with optional parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>dataset</strong> – input dataset, which is an instance of <a class="reference internal" href="pyspark.sql.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal"><span class="pre">pyspark.sql.DataFrame</span></code></a></li>
<li><strong>params</strong> – an optional param map that overrides embedded params.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">transformed dataset</p>
</td>
</tr>
</tbody>
</table>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.fpm.FPGrowthModel.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.fpm.FPGrowthModel.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="module-pyspark.ml.util">
<span id="pyspark-ml-util-module"></span><h2>pyspark.ml.util module<a class="headerlink" href="#module-pyspark.ml.util" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="pyspark.ml.util.Identifiable">
<em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">Identifiable</code><a class="reference internal" href="_modules/pyspark/ml/util.html#Identifiable"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.Identifiable" title="Permalink to this definition"></a></dt>
<dd><p>Object with a unique ID.</p>
<dl class="attribute">
<dt id="pyspark.ml.util.Identifiable.uid">
<code class="descname">uid</code><em class="property"> = None</em><a class="headerlink" href="#pyspark.ml.util.Identifiable.uid" title="Permalink to this definition"></a></dt>
<dd><p>A unique id for the object.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.util.JavaMLReadable">
<em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">JavaMLReadable</code><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLReadable"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLReadable" title="Permalink to this definition"></a></dt>
<dd><p>(Private) Mixin for instances that provide JavaMLReader.</p>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLReadable.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.util.JavaMLReadable.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.ml.util.JavaMLReadable.read">
<em class="property">classmethod </em><code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLReadable.read"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLReadable.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.util.JavaMLReader">
<em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">JavaMLReader</code><span class="sig-paren">(</span><em>clazz</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLReader"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLReader" title="Permalink to this definition"></a></dt>
<dd><p>(Private) Specialization of <a class="reference internal" href="#pyspark.ml.util.MLReader" title="pyspark.ml.util.MLReader"><code class="xref py py-class docutils literal"><span class="pre">MLReader</span></code></a> for <code class="xref py py-class docutils literal"><span class="pre">JavaParams</span></code> types.</p>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLReader.context">
<code class="descname">context</code><span class="sig-paren">(</span><em>sqlContext</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLReader.context"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLReader.context" title="Permalink to this definition"></a></dt>
<dd><p>Sets the SQL context to use for loading.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Deprecated in 2.1 and will be removed in 3.0, use session instead.</p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLReader.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLReader.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLReader.load" title="Permalink to this definition"></a></dt>
<dd><p>Load the ML instance from the input path.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLReader.session">
<code class="descname">session</code><span class="sig-paren">(</span><em>sparkSession</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLReader.session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLReader.session" title="Permalink to this definition"></a></dt>
<dd><p>Sets the Spark Session to use for loading.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.util.JavaMLWritable">
<em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">JavaMLWritable</code><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLWritable"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLWritable" title="Permalink to this definition"></a></dt>
<dd><p>(Private) Mixin for ML instances that provide <a class="reference internal" href="#pyspark.ml.util.JavaMLWriter" title="pyspark.ml.util.JavaMLWriter"><code class="xref py py-class docutils literal"><span class="pre">JavaMLWriter</span></code></a>.</p>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLWritable.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.util.JavaMLWritable.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLWritable.write">
<code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLWritable.write"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLWritable.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.util.JavaMLWriter">
<em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">JavaMLWriter</code><span class="sig-paren">(</span><em>instance</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLWriter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLWriter" title="Permalink to this definition"></a></dt>
<dd><p>(Private) Specialization of <a class="reference internal" href="#pyspark.ml.util.MLWriter" title="pyspark.ml.util.MLWriter"><code class="xref py py-class docutils literal"><span class="pre">MLWriter</span></code></a> for <code class="xref py py-class docutils literal"><span class="pre">JavaParams</span></code> types.</p>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLWriter.context">
<code class="descname">context</code><span class="sig-paren">(</span><em>sqlContext</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLWriter.context"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLWriter.context" title="Permalink to this definition"></a></dt>
<dd><p>Sets the SQL context to use for saving.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Deprecated in 2.1 and will be removed in 3.0, use session instead.</p>
</div>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLWriter.overwrite">
<code class="descname">overwrite</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLWriter.overwrite"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLWriter.overwrite" title="Permalink to this definition"></a></dt>
<dd><p>Overwrites if the output path already exists.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLWriter.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLWriter.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLWriter.save" title="Permalink to this definition"></a></dt>
<dd><p>Save the ML instance to the input path.</p>
</dd></dl>
<dl class="method">
<dt id="pyspark.ml.util.JavaMLWriter.session">
<code class="descname">session</code><span class="sig-paren">(</span><em>sparkSession</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaMLWriter.session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaMLWriter.session" title="Permalink to this definition"></a></dt>
<dd><p>Sets the Spark Session to use for saving.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.util.JavaPredictionModel">
<em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">JavaPredictionModel</code><a class="reference internal" href="_modules/pyspark/ml/util.html#JavaPredictionModel"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.JavaPredictionModel" title="Permalink to this definition"></a></dt>
<dd><p>(Private) Java Model for prediction tasks (regression and classification).
To be mixed in with <code class="xref py py-class docutils literal"><span class="pre">pyspark.ml.JavaModel</span></code>.</p>
<dl class="attribute">
<dt id="pyspark.ml.util.JavaPredictionModel.numFeatures">
<code class="descname">numFeatures</code><a class="headerlink" href="#pyspark.ml.util.JavaPredictionModel.numFeatures" title="Permalink to this definition"></a></dt>
<dd><p>Returns the number of features the model was trained on. If unknown, returns -1.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.1.0.</span></p>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="pyspark.ml.util.MLReadable">
<em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">MLReadable</code><a class="reference internal" href="_modules/pyspark/ml/util.html#MLReadable"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLReadable" title="Permalink to this definition"></a></dt>
<dd><p>Mixin for instances that provide <a class="reference internal" href="#pyspark.ml.util.MLReader" title="pyspark.ml.util.MLReader"><code class="xref py py-class docutils literal"><span class="pre">MLReader</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 2.0.0.</span></p>
</div>
<dl class="classmethod">
<dt id="pyspark.ml.util.MLReadable.load">
<em class="property">classmethod </em><code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLReadable.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLReadable.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="classmethod">
<dt id="pyspark.ml.util.MLReadable.read">
<em class="property">classmethod </em><code class="descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLReadable.read"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLReadable.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
</dd></dl>
<!-- pyspark.ml.util.MLReader: class entry followed by its method entries (context, load, session). -->
<!-- Each headerlink anchor carries &#182; as its visible permalink glyph; an empty anchor gives users nothing to click. -->
<dl class="class">
  <dt id="pyspark.ml.util.MLReader">
    <em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">MLReader</code><a class="reference internal" href="_modules/pyspark/ml/util.html#MLReader"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLReader" title="Permalink to this definition">&#182;</a></dt>
  <dd><p>Utility class that can load ML instances.</p>
    <div class="versionadded">
      <p><span class="versionmodified">New in version 2.0.0.</span></p>
    </div>
    <!-- MLReader.context(sqlContext): the note below marks it as deprecated in favour of session. -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLReader.context">
        <code class="descname">context</code><span class="sig-paren">(</span><em>sqlContext</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLReader.context"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLReader.context" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Sets the SQL context to use for loading.</p>
        <div class="admonition note">
          <p class="first admonition-title">Note</p>
          <p class="last">Deprecated in 2.1 and will be removed in 3.0, use session instead.</p>
        </div>
      </dd></dl>
    <!-- MLReader.load(path) -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLReader.load">
        <code class="descname">load</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLReader.load"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLReader.load" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Load the ML instance from the input path.</p>
      </dd></dl>
    <!-- MLReader.session(sparkSession) -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLReader.session">
        <code class="descname">session</code><span class="sig-paren">(</span><em>sparkSession</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLReader.session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLReader.session" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Sets the Spark Session to use for loading.</p>
      </dd></dl>
  </dd></dl>
<!-- pyspark.ml.util.MLWritable: class entry followed by its method entries (save, write). -->
<!-- Each headerlink anchor carries &#182; as its visible permalink glyph; an empty anchor gives users nothing to click. -->
<dl class="class">
  <dt id="pyspark.ml.util.MLWritable">
    <em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">MLWritable</code><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWritable"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWritable" title="Permalink to this definition">&#182;</a></dt>
  <dd><p>Mixin for ML instances that provide <a class="reference internal" href="#pyspark.ml.util.MLWriter" title="pyspark.ml.util.MLWriter"><code class="xref py py-class docutils literal"><span class="pre">MLWriter</span></code></a>.</p>
    <div class="versionadded">
      <p><span class="versionmodified">New in version 2.0.0.</span></p>
    </div>
    <!-- MLWritable.save(path) -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLWritable.save">
        <code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWritable.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWritable.save" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Save this ML instance to the given path, a shortcut of <cite>write().save(path)</cite>.</p>
      </dd></dl>
    <!-- MLWritable.write() -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLWritable.write">
        <code class="descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWritable.write"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWritable.write" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Returns an MLWriter instance for this ML instance.</p>
      </dd></dl>
  </dd></dl>
<!-- pyspark.ml.util.MLWriter: class entry followed by its method entries (context, overwrite, save, session). -->
<!-- Each headerlink anchor carries &#182; as its visible permalink glyph; an empty anchor gives users nothing to click. -->
<dl class="class">
  <dt id="pyspark.ml.util.MLWriter">
    <em class="property">class </em><code class="descclassname">pyspark.ml.util.</code><code class="descname">MLWriter</code><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWriter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWriter" title="Permalink to this definition">&#182;</a></dt>
  <dd><p>Utility class that can save ML instances.</p>
    <div class="versionadded">
      <p><span class="versionmodified">New in version 2.0.0.</span></p>
    </div>
    <!-- MLWriter.context(sqlContext): the note below marks it as deprecated in favour of session. -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLWriter.context">
        <code class="descname">context</code><span class="sig-paren">(</span><em>sqlContext</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWriter.context"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWriter.context" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Sets the SQL context to use for saving.</p>
        <div class="admonition note">
          <p class="first admonition-title">Note</p>
          <p class="last">Deprecated in 2.1 and will be removed in 3.0, use session instead.</p>
        </div>
      </dd></dl>
    <!-- MLWriter.overwrite() -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLWriter.overwrite">
        <code class="descname">overwrite</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWriter.overwrite"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWriter.overwrite" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Overwrites if the output path already exists.</p>
      </dd></dl>
    <!-- MLWriter.save(path) -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLWriter.save">
        <code class="descname">save</code><span class="sig-paren">(</span><em>path</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWriter.save"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWriter.save" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Save the ML instance to the input path.</p>
      </dd></dl>
    <!-- MLWriter.session(sparkSession) -->
    <dl class="method">
      <dt id="pyspark.ml.util.MLWriter.session">
        <code class="descname">session</code><span class="sig-paren">(</span><em>sparkSession</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/pyspark/ml/util.html#MLWriter.session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.util.MLWriter.session" title="Permalink to this definition">&#182;</a></dt>
      <dd><p>Sets the Spark Session to use for saving.</p>
      </dd></dl>
  </dd></dl>
</div>
</div>
</div>
</div>
</div>
<!-- Sidebar: logo, in-page table of contents, previous/next topic links, source link and quick search. -->
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
  <div class="sphinxsidebarwrapper">
    <p class="logo"><a href="index.html">
      <img class="logo" src="_static/spark-logo-hd.png" alt="Logo"/>
    </a></p>
    <!-- Table of contents: one fragment link per module section of this page. -->
    <h3><a href="index.html">Table Of Contents</a></h3>
    <ul>
      <li><a class="reference internal" href="#">pyspark.ml package</a><ul>
        <li><a class="reference internal" href="#module-pyspark.ml">ML Pipeline APIs</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.param">pyspark.ml.param module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.feature">pyspark.ml.feature module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.classification">pyspark.ml.classification module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.clustering">pyspark.ml.clustering module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.linalg">pyspark.ml.linalg module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.recommendation">pyspark.ml.recommendation module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.regression">pyspark.ml.regression module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.stat">pyspark.ml.stat module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.tuning">pyspark.ml.tuning module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.evaluation">pyspark.ml.evaluation module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.fpm">pyspark.ml.fpm module</a></li>
        <li><a class="reference internal" href="#module-pyspark.ml.util">pyspark.ml.util module</a></li>
      </ul>
      </li>
    </ul>
    <!-- Neighbouring pages in the documentation sequence. -->
    <h4>Previous topic</h4>
    <p class="topless"><a href="pyspark.streaming.html"
       title="previous chapter">pyspark.streaming module</a></p>
    <h4>Next topic</h4>
    <p class="topless"><a href="pyspark.mllib.html"
       title="next chapter">pyspark.mllib package</a></p>
    <!-- Link to the reStructuredText source of this page. -->
    <div role="note" aria-label="source link">
      <h3>This Page</h3>
      <ul class="this-page-menu">
        <li><a href="_sources/pyspark.ml.rst.txt"
               rel="nofollow">Show Source</a></li>
      </ul>
    </div>
    <!-- Quick search: starts hidden via the inline style and is revealed by the script below. -->
    <!-- The heading id plus aria-labelledby give the text input an accessible name. -->
    <div id="searchbox" style="display: none" role="search">
      <h3 id="searchlabel">Quick search</h3>
      <form class="search" action="search.html" method="get">
        <div><input type="text" name="q" aria-labelledby="searchlabel" /></div>
        <div><input type="submit" value="Go" /></div>
        <input type="hidden" name="check_keywords" value="yes" />
        <input type="hidden" name="area" value="default" />
      </form>
    </div>
    <script type="text/javascript">$('#searchbox').show(0);</script>
  </div>
</div>
<div class="clearer"></div>
</div>
<!-- Bottom navigation bar: next/previous page links followed by the breadcrumb trail. -->
<div class="related" role="navigation" aria-label="related navigation">
  <h3>Navigation</h3>
  <ul>
    <!-- Next and previous page links (class "right"). -->
    <li class="right" style="margin-right: 10px">
      <a href="pyspark.mllib.html" title="pyspark.mllib package">next</a></li>
    <li class="right">
      <a href="pyspark.streaming.html" title="pyspark.streaming module">previous</a> |</li>
    <!-- Breadcrumb trail: documentation root, then the parent package. -->
    <li class="nav-item nav-item-0"><a href="index.html">PySpark 2.2.1 documentation</a> &#187;</li>
    <li class="nav-item nav-item-1"><a href="pyspark.html">pyspark package</a> &#187;</li>
  </ul>
</div>
<!-- Page footer: copyright line and generator credit. -->
<!-- NOTE(review): the copyright holder is empty in this generated output; confirm the intended owner before filling it in. -->
<div class="footer" role="contentinfo">
  &#169; Copyright .
  Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 1.6.5.
</div>
</body>
</html>