blob: 5190f300118133da92ac3e6b7c3ccaf5e7ae9118 [file] [log] [blame]
<!DOCTYPE html>
<!-- lang added: page content is English (matches docsearch:language meta); required for AT/screen readers -->
<html lang="en">
<head>
<!-- Generated Sphinx (pydata theme) head for the pyspark.ml.feature source page -->
<meta charset="utf-8" />
<title>pyspark.ml.feature &#8212; PySpark 3.2.3 documentation</title>
<link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
<link rel="stylesheet"
href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<!-- Font preloads: crossorigin is required on as="font" or the preloaded fetch cannot be reused -->
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet"
href="../../../_static/vendor/open-sans_all/1.44.1/index.css">
<link rel="stylesheet"
href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css">
<link rel="stylesheet" href="../../../_static/basic.css" />
<link rel="stylesheet" href="../../../_static/pygments.css" />
<link rel="stylesheet" href="../../../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js">
<script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/language_data.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<!-- MathJax (renders $...$ math in docstrings); its config block below must keep type="text/x-mathjax-config" -->
<script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en" />
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<!-- Top navigation bar (Bootstrap navbar, collapses behind the toggler below lg) -->
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main">
<div class="container-xl">
<a class="navbar-brand" href="../../../index.html">
<!-- alt describes the link destination (site home), not just "logo" -->
<img src="../../../_static/spark-logo-reverse.png" class="logo" alt="Apache Spark logo" />
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
<ul id="navbar-main-elements" class="navbar-nav mr-auto">
<li class="nav-item">
<a class="nav-link" href="../../../getting_started/index.html">Getting Started</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../user_guide/index.html">User Guide</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../reference/index.html">API Reference</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../development/index.html">Development</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a>
</li>
</ul>
<!-- right-hand nav slot; empty on this page but kept so the flex layout is unchanged -->
<ul class="navbar-nav">
</ul>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
<!-- decorative search icon: hidden from assistive tech; the input carries the accessible name -->
<i class="icon fas fa-search" aria-hidden="true"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off">
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<ul class="nav bd-sidenav">
</ul>
<!-- closing tag was missing for .bd-toc-item: the div was left open when </nav> appeared (invalid nesting) -->
</div>
</nav>
</div>
<!-- Right-hand in-page TOC (hidden below xl); #bd-toc-nav is the scrollspy target named by <body data-target> -->
<div class="d-none d-xl-block col-xl-2 bd-toc">
<nav id="bd-toc-nav">
<ul class="nav section-nav flex-column">
</ul>
</nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<h1>Source code for pyspark.ml.feature</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">since</span><span class="p">,</span> <span class="n">keyword_only</span><span class="p">,</span> <span class="n">SparkContext</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">_convert_to_vector</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.param.shared</span> <span class="kn">import</span> <span class="n">HasThreshold</span><span class="p">,</span> <span class="n">HasThresholds</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> \
<span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">,</span> <span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">,</span> \
<span class="n">HasSeed</span><span class="p">,</span> <span class="n">HasNumFeatures</span><span class="p">,</span> <span class="n">HasStepSize</span><span class="p">,</span> <span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">TypeConverters</span><span class="p">,</span> <span class="n">Param</span><span class="p">,</span> <span class="n">Params</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.util</span> <span class="kn">import</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.wrapper</span> <span class="kn">import</span> <span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">JavaModel</span><span class="p">,</span> <span class="n">JavaParams</span><span class="p">,</span> <span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">_jvm</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.common</span> <span class="kn">import</span> <span class="n">inherit_doc</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Binarizer&#39;</span><span class="p">,</span>
<span class="s1">&#39;BucketedRandomProjectionLSH&#39;</span><span class="p">,</span> <span class="s1">&#39;BucketedRandomProjectionLSHModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;Bucketizer&#39;</span><span class="p">,</span>
<span class="s1">&#39;ChiSqSelector&#39;</span><span class="p">,</span> <span class="s1">&#39;ChiSqSelectorModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;CountVectorizer&#39;</span><span class="p">,</span> <span class="s1">&#39;CountVectorizerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;DCT&#39;</span><span class="p">,</span>
<span class="s1">&#39;ElementwiseProduct&#39;</span><span class="p">,</span>
<span class="s1">&#39;FeatureHasher&#39;</span><span class="p">,</span>
<span class="s1">&#39;HashingTF&#39;</span><span class="p">,</span>
<span class="s1">&#39;IDF&#39;</span><span class="p">,</span> <span class="s1">&#39;IDFModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;Imputer&#39;</span><span class="p">,</span> <span class="s1">&#39;ImputerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;IndexToString&#39;</span><span class="p">,</span>
<span class="s1">&#39;Interaction&#39;</span><span class="p">,</span>
<span class="s1">&#39;MaxAbsScaler&#39;</span><span class="p">,</span> <span class="s1">&#39;MaxAbsScalerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;MinHashLSH&#39;</span><span class="p">,</span> <span class="s1">&#39;MinHashLSHModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;MinMaxScaler&#39;</span><span class="p">,</span> <span class="s1">&#39;MinMaxScalerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;NGram&#39;</span><span class="p">,</span>
<span class="s1">&#39;Normalizer&#39;</span><span class="p">,</span>
<span class="s1">&#39;OneHotEncoder&#39;</span><span class="p">,</span> <span class="s1">&#39;OneHotEncoderModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;PCA&#39;</span><span class="p">,</span> <span class="s1">&#39;PCAModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;PolynomialExpansion&#39;</span><span class="p">,</span>
<span class="s1">&#39;QuantileDiscretizer&#39;</span><span class="p">,</span>
<span class="s1">&#39;RobustScaler&#39;</span><span class="p">,</span> <span class="s1">&#39;RobustScalerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;RegexTokenizer&#39;</span><span class="p">,</span>
<span class="s1">&#39;RFormula&#39;</span><span class="p">,</span> <span class="s1">&#39;RFormulaModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;SQLTransformer&#39;</span><span class="p">,</span>
<span class="s1">&#39;StandardScaler&#39;</span><span class="p">,</span> <span class="s1">&#39;StandardScalerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;StopWordsRemover&#39;</span><span class="p">,</span>
<span class="s1">&#39;StringIndexer&#39;</span><span class="p">,</span> <span class="s1">&#39;StringIndexerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;Tokenizer&#39;</span><span class="p">,</span>
<span class="s1">&#39;UnivariateFeatureSelector&#39;</span><span class="p">,</span> <span class="s1">&#39;UnivariateFeatureSelectorModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;VarianceThresholdSelector&#39;</span><span class="p">,</span> <span class="s1">&#39;VarianceThresholdSelectorModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;VectorAssembler&#39;</span><span class="p">,</span>
<span class="s1">&#39;VectorIndexer&#39;</span><span class="p">,</span> <span class="s1">&#39;VectorIndexerModel&#39;</span><span class="p">,</span>
<span class="s1">&#39;VectorSizeHint&#39;</span><span class="p">,</span>
<span class="s1">&#39;VectorSlicer&#39;</span><span class="p">,</span>
<span class="s1">&#39;Word2Vec&#39;</span><span class="p">,</span> <span class="s1">&#39;Word2VecModel&#39;</span><span class="p">]</span>
<div class="viewcode-block" id="Binarizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Binarizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasThreshold</span><span class="p">,</span> <span class="n">HasThresholds</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Binarize a column of continuous features given a threshold. Since 3.0.0,</span>
<span class="sd"> :py:class:`Binarize` can map multiple columns at once by setting the :py:attr:`inputCols`</span>
<span class="sd"> parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters</span>
<span class="sd"> are set, an Exception will be thrown. The :py:attr:`threshold` parameter is used for</span>
<span class="sd"> single column usage, and :py:attr:`thresholds` is for multiple columns.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0.5,)], [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; binarizer = Binarizer(threshold=1.0, inputCol=&quot;values&quot;, outputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setThreshold(1.0)</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setInputCol(&quot;values&quot;)</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setOutputCol(&quot;features&quot;)</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer.transform(df).head().features</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; binarizer.setParams(outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; params = {binarizer.threshold: -0.5, binarizer.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; binarizer.transform(df, params).head().vector</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; binarizerPath = temp_path + &quot;/binarizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; binarizer.save(binarizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBinarizer = Binarizer.load(binarizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBinarizer.getThreshold() == binarizer.getThreshold()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedBinarizer.transform(df).take(1) == binarizer.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(0.5, 0.3)], [&quot;values1&quot;, &quot;values2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; binarizer2 = Binarizer(thresholds=[0.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; binarizer2.setInputCols([&quot;values1&quot;, &quot;values2&quot;]).setOutputCols([&quot;output1&quot;, &quot;output2&quot;])</span>
<span class="sd"> Binarizer...</span>
<span class="sd"> &gt;&gt;&gt; binarizer2.transform(df2).show()</span>
<span class="sd"> +-------+-------+-------+-------+</span>
<span class="sd"> |values1|values2|output1|output2|</span>
<span class="sd"> +-------+-------+-------+-------+</span>
<span class="sd"> | 0.5| 0.3| 1.0| 0.0|</span>
<span class="sd"> +-------+-------+-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">threshold</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;threshold&quot;</span><span class="p">,</span>
<span class="s2">&quot;Param for threshold used to binarize continuous features. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;The features greater than the threshold will be binarized to 1.0. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;The features equal to or less than the threshold will be binarized to 0.0&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">thresholds</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;thresholds&quot;</span><span class="p">,</span>
<span class="s2">&quot;Param for array of threshold used to binarize continuous features. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;This is for multiple columns input. If transforming multiple columns &quot;</span> <span class="o">+</span>
<span class="s2">&quot;and thresholds is not set, but threshold is set, then threshold will &quot;</span> <span class="o">+</span>
<span class="s2">&quot;be applied across all columns.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">thresholds</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \</span>
<span class="sd"> inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Binarizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Binarizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Binarizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">thresholds</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \</span>
<span class="sd"> inputCols=None, outputCols=None)</span>
<span class="sd"> Sets params for this Binarizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`threshold`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setThresholds"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setThresholds">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setThresholds</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`thresholds`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">thresholds</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Binarizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_LSHParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Locality Sensitive Hashing (LSH) algorithm parameters.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">numHashTables</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;numHashTables&quot;</span><span class="p">,</span> <span class="s2">&quot;number of hash tables, where &quot;</span> <span class="o">+</span>
<span class="s2">&quot;increasing number of hash tables lowers the false negative rate, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;and decreasing it improves the running performance.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_LSHParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numHashTables</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumHashTables</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numHashTables or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numHashTables</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">_LSH</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_LSHParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Locality Sensitive Hashing (LSH).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">setNumHashTables</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numHashTables`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numHashTables</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<!-- Generated markup (Sphinx viewcode + Pygments): rendering of the Python class
     _LSHModel(JavaModel, _LSHParams) — column setters plus approxNearestNeighbors and
     approxSimilarityJoin, both delegating to the JVM via self._call_java.
     Do not hand-edit the token spans; regenerate the docs from the Python source instead. -->
<span class="k">class</span> <span class="nc">_LSHModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_LSHParams</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Locality Sensitive Hashing (LSH) models.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">approxNearestNeighbors</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">numNearestNeighbors</span><span class="p">,</span> <span class="n">distCol</span><span class="o">=</span><span class="s2">&quot;distCol&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Given a large dataset and an item, approximately find at most k items which have the</span>
<span class="sd"> closest distance to the item. If the :py:attr:`outputCol` is missing, the method will</span>
<span class="sd"> transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows</span>
<span class="sd"> caching of the transformed data when necessary.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method is experimental and will likely change behavior in the next release.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dataset : :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> The dataset to search for nearest neighbors of the key.</span>
<span class="sd"> key : :py:class:`pyspark.ml.linalg.Vector`</span>
<span class="sd"> Feature vector representing the item to search for.</span>
<span class="sd"> numNearestNeighbors : int</span>
<span class="sd"> The maximum number of nearest neighbors.</span>
<span class="sd"> distCol : str</span>
<span class="sd"> Output column for storing the distance between each result row and the key.</span>
<span class="sd"> Use &quot;distCol&quot; as default value if it&#39;s not specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> A dataset containing at most k items closest to the key. A column &quot;distCol&quot; is</span>
<span class="sd"> added to show the distance between each row and the key.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;approxNearestNeighbors&quot;</span><span class="p">,</span> <span class="n">dataset</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">numNearestNeighbors</span><span class="p">,</span>
<span class="n">distCol</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">approxSimilarityJoin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasetA</span><span class="p">,</span> <span class="n">datasetB</span><span class="p">,</span> <span class="n">threshold</span><span class="p">,</span> <span class="n">distCol</span><span class="o">=</span><span class="s2">&quot;distCol&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Join two datasets to approximately find all pairs of rows whose distance are smaller than</span>
<span class="sd"> the threshold. If the :py:attr:`outputCol` is missing, the method will transform the data;</span>
<span class="sd"> if the :py:attr:`outputCol` exists, it will use that. This allows caching of the</span>
<span class="sd"> transformed data when necessary.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> datasetA : :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> One of the datasets to join.</span>
<span class="sd"> datasetB : :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> Another dataset to join.</span>
<span class="sd"> threshold : float</span>
<span class="sd"> The threshold for the distance of row pairs.</span>
<span class="sd"> distCol : str, optional</span>
<span class="sd"> Output column for storing the distance between each pair of rows. Use</span>
<span class="sd"> &quot;distCol&quot; as default value if it&#39;s not specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :py:class:`pyspark.sql.DataFrame`</span>
<span class="sd"> A joined dataset containing pairs of rows. The original rows are in columns</span>
<span class="sd"> &quot;datasetA&quot; and &quot;datasetB&quot;, and a column &quot;distCol&quot; is added to show the distance</span>
<span class="sd"> between each pair.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">threshold</span> <span class="o">=</span> <span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">(</span><span class="n">threshold</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;approxSimilarityJoin&quot;</span><span class="p">,</span> <span class="n">datasetA</span><span class="p">,</span> <span class="n">datasetB</span><span class="p">,</span> <span class="n">threshold</span><span class="p">,</span> <span class="n">distCol</span><span class="p">)</span>
<!-- Generated markup (Sphinx viewcode + Pygments): rendering of the Python class
     _BucketedRandomProjectionLSHParams, which declares the bucketLength Param
     (TypeConverters.toFloat) and its getter getBucketLength.
     Do not hand-edit the token spans; regenerate the docs from the Python source instead. -->
<span class="k">class</span> <span class="nc">_BucketedRandomProjectionLSHParams</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`BucketedRandomProjectionLSH` and</span>
<span class="sd"> :py:class:`BucketedRandomProjectionLSHModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">bucketLength</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;bucketLength&quot;</span><span class="p">,</span> <span class="s2">&quot;the length of each hash bucket, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;a larger bucket lowers the false negative rate.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getBucketLength</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of bucketLength or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">bucketLength</span><span class="p">)</span>
<!-- Sphinx viewcode anchor div for BucketedRandomProjectionLSH: the id and the
     viewcode-back links pair this rendered source with the API reference page.
     Nested viewcode-block divs anchor setParams, setBucketLength and setSeed.
     Generated content — do not hand-edit; regenerate from the Python source instead. -->
<div class="viewcode-block" id="BucketedRandomProjectionLSH"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">BucketedRandomProjectionLSH</span><span class="p">(</span><span class="n">_LSH</span><span class="p">,</span> <span class="n">_BucketedRandomProjectionLSHParams</span><span class="p">,</span>
<span class="n">HasSeed</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> LSH class for Euclidean distance metrics.</span>
<span class="sd"> The input is dense or sparse vectors, each of which represents a point in the Euclidean</span>
<span class="sd"> distance space. The output will be vectors of configurable dimension. Hash values in the same</span>
<span class="sd"> dimension are calculated by the same hash function.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - `Stable Distributions in Wikipedia article on Locality-sensitive hashing \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions&gt;`_</span>
<span class="sd"> - `Hashing for Similarity Search: A Survey &lt;https://arxiv.org/abs/1408.2927&gt;`_</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col</span>
<span class="sd"> &gt;&gt;&gt; data = [(0, Vectors.dense([-1.0, -1.0 ]),),</span>
<span class="sd"> ... (1, Vectors.dense([-1.0, 1.0 ]),),</span>
<span class="sd"> ... (2, Vectors.dense([1.0, -1.0 ]),),</span>
<span class="sd"> ... (3, Vectors.dense([1.0, 1.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; brp = BucketedRandomProjectionLSH()</span>
<span class="sd"> &gt;&gt;&gt; brp.setInputCol(&quot;features&quot;)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; brp.setOutputCol(&quot;hashes&quot;)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; brp.setSeed(12345)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; brp.setBucketLength(1.0)</span>
<span class="sd"> BucketedRandomProjectionLSH...</span>
<span class="sd"> &gt;&gt;&gt; model = brp.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getBucketLength()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;hashes&quot;)</span>
<span class="sd"> BucketedRandomProjectionLSHModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head()</span>
<span class="sd"> Row(id=0, features=DenseVector([-1.0, -1.0]), hashes=[DenseVector([-1.0])])</span>
<span class="sd"> &gt;&gt;&gt; data2 = [(4, Vectors.dense([2.0, 2.0 ]),),</span>
<span class="sd"> ... (5, Vectors.dense([2.0, 3.0 ]),),</span>
<span class="sd"> ... (6, Vectors.dense([3.0, 2.0 ]),),</span>
<span class="sd"> ... (7, Vectors.dense([3.0, 3.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(data2, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; model.approxNearestNeighbors(df2, Vectors.dense([1.0, 2.0]), 1).collect()</span>
<span class="sd"> [Row(id=4, features=DenseVector([2.0, 2.0]), hashes=[DenseVector([1.0])], distCol=1.0)]</span>
<span class="sd"> &gt;&gt;&gt; model.approxSimilarityJoin(df, df2, 3.0, distCol=&quot;EuclideanDistance&quot;).select(</span>
<span class="sd"> ... col(&quot;datasetA.id&quot;).alias(&quot;idA&quot;),</span>
<span class="sd"> ... col(&quot;datasetB.id&quot;).alias(&quot;idB&quot;),</span>
<span class="sd"> ... col(&quot;EuclideanDistance&quot;)).show()</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> |idA|idB|EuclideanDistance|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> | 3| 6| 2.23606797749979|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.approxSimilarityJoin(df, df2, 3, distCol=&quot;EuclideanDistance&quot;).select(</span>
<span class="sd"> ... col(&quot;datasetA.id&quot;).alias(&quot;idA&quot;),</span>
<span class="sd"> ... col(&quot;datasetB.id&quot;).alias(&quot;idB&quot;),</span>
<span class="sd"> ... col(&quot;EuclideanDistance&quot;)).show()</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> |idA|idB|EuclideanDistance|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> | 3| 6| 2.23606797749979|</span>
<span class="sd"> +---+---+-----------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; brpPath = temp_path + &quot;/brp&quot;</span>
<span class="sd"> &gt;&gt;&gt; brp.save(brpPath)</span>
<span class="sd"> &gt;&gt;&gt; brp2 = BucketedRandomProjectionLSH.load(brpPath)</span>
<span class="sd"> &gt;&gt;&gt; brp2.getBucketLength() == brp.getBucketLength()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/brp-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model2 = BucketedRandomProjectionLSHModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().hashes == model2.transform(df).head().hashes</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">numHashTables</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">bucketLength</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \</span>
<span class="sd"> bucketLength=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">BucketedRandomProjectionLSH</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> \
<span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.BucketedRandomProjectionLSH&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="BucketedRandomProjectionLSH.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">numHashTables</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">bucketLength</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \</span>
<span class="sd"> bucketLength=None)</span>
<span class="sd"> Sets params for this BucketedRandomProjectionLSH.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="BucketedRandomProjectionLSH.setBucketLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setBucketLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBucketLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`bucketLength`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">bucketLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="BucketedRandomProjectionLSH.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">BucketedRandomProjectionLSHModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- Sphinx viewcode anchor div for BucketedRandomProjectionLSHModel: the class body
     rendered here consists only of a raw docstring (note the class="sa">r</span> prefix
     marking the r&quot;&quot;&quot; literal, needed for the \cdot in the math).
     Generated content — do not hand-edit; regenerate from the Python source instead. -->
<div class="viewcode-block" id="BucketedRandomProjectionLSHModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSHModel.html#pyspark.ml.feature.BucketedRandomProjectionLSHModel">[docs]</a><span class="k">class</span> <span class="nc">BucketedRandomProjectionLSHModel</span><span class="p">(</span><span class="n">_LSHModel</span><span class="p">,</span> <span class="n">_BucketedRandomProjectionLSHParams</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are</span>
<span class="sd"> stored. The vectors are normalized to be unit vectors and each vector is used in a hash</span>
<span class="sd"> function: :math:`h_i(x) = floor(r_i \cdot x / bucketLength)` where :math:`r_i` is the</span>
<span class="sd"> i-th random unit vector. The number of buckets will be `(max L2 norm of input vectors) /</span>
<span class="sd"> bucketLength`.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="Bucketizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Bucketizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Maps a column of continuous features to a column of feature buckets. Since 3.0.0,</span>
<span class="sd"> :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`</span>
<span class="sd"> parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters</span>
<span class="sd"> are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single</span>
<span class="sd"> column usage, and :py:attr:`splitsArray` is for multiple columns.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float(&quot;nan&quot;)),</span>
<span class="sd"> ... (float(&quot;nan&quot;), 1.0), (float(&quot;nan&quot;), 0.0)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(values, [&quot;values1&quot;, &quot;values2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; bucketizer = Bucketizer()</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setSplits([-float(&quot;inf&quot;), 0.5, 1.4, float(&quot;inf&quot;)])</span>
<span class="sd"> Bucketizer...</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setInputCol(&quot;values1&quot;)</span>
<span class="sd"> Bucketizer...</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setOutputCol(&quot;buckets&quot;)</span>
<span class="sd"> Bucketizer...</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.setHandleInvalid(&quot;keep&quot;).transform(df).collect()</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.setHandleInvalid(&quot;keep&quot;).transform(df.select(&quot;values1&quot;))</span>
<span class="sd"> &gt;&gt;&gt; bucketed.show(truncate=False)</span>
<span class="sd"> +-------+-------+</span>
<span class="sd"> |values1|buckets|</span>
<span class="sd"> +-------+-------+</span>
<span class="sd"> |0.1 |0.0 |</span>
<span class="sd"> |0.4 |0.0 |</span>
<span class="sd"> |1.2 |1.0 |</span>
<span class="sd"> |1.5 |2.0 |</span>
<span class="sd"> |NaN |3.0 |</span>
<span class="sd"> |NaN |3.0 |</span>
<span class="sd"> +-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.setParams(outputCol=&quot;b&quot;).transform(df).head().b</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; bucketizerPath = temp_path + &quot;/bucketizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; bucketizer.save(bucketizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBucketizer = Bucketizer.load(bucketizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedBucketizer.getSplits() == bucketizer.getSplits()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedBucketizer.transform(df).take(1) == bucketizer.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.setHandleInvalid(&quot;skip&quot;).transform(df).collect()</span>
<span class="sd"> &gt;&gt;&gt; len(bucketed)</span>
<span class="sd"> 4</span>
<span class="sd"> &gt;&gt;&gt; bucketizer2 = Bucketizer(splitsArray=</span>
<span class="sd"> ... [[-float(&quot;inf&quot;), 0.5, 1.4, float(&quot;inf&quot;)], [-float(&quot;inf&quot;), 0.5, float(&quot;inf&quot;)]],</span>
<span class="sd"> ... inputCols=[&quot;values1&quot;, &quot;values2&quot;], outputCols=[&quot;buckets1&quot;, &quot;buckets2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; bucketed2 = bucketizer2.setHandleInvalid(&quot;keep&quot;).transform(df)</span>
<span class="sd"> &gt;&gt;&gt; bucketed2.show(truncate=False)</span>
<span class="sd"> +-------+-------+--------+--------+</span>
<span class="sd"> |values1|values2|buckets1|buckets2|</span>
<span class="sd"> +-------+-------+--------+--------+</span>
<span class="sd"> |0.1 |0.0 |0.0 |0.0 |</span>
<span class="sd"> |0.4 |1.0 |0.0 |1.0 |</span>
<span class="sd"> |1.2 |1.3 |1.0 |1.0 |</span>
<span class="sd"> |1.5 |NaN |2.0 |2.0 |</span>
<span class="sd"> |NaN |1.0 |3.0 |1.0 |</span>
<span class="sd"> |NaN |0.0 |3.0 |0.0 |</span>
<span class="sd"> +-------+-------+--------+--------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">splits</span> <span class="o">=</span> \
<span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;splits&quot;</span><span class="p">,</span>
<span class="s2">&quot;Split points for mapping continuous features into buckets. With n+1 splits, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;there are n buckets. A bucket defined by splits x,y holds values in the &quot;</span> <span class="o">+</span>
<span class="s2">&quot;range [x,y) except the last bucket, which also includes y. The splits &quot;</span> <span class="o">+</span>
<span class="s2">&quot;should be of length &gt;= 3 and strictly increasing. Values at -inf, inf must be &quot;</span> <span class="o">+</span>
<span class="s2">&quot;explicitly provided to cover all Double values; otherwise, values outside the &quot;</span> <span class="o">+</span>
<span class="s2">&quot;splits specified will be treated as errors.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">)</span>
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span> <span class="s2">&quot;how to handle invalid entries &quot;</span>
<span class="s2">&quot;containing NaN values. Values outside the splits will always be treated &quot;</span>
<span class="s2">&quot;as errors. Options are &#39;skip&#39; (filter out rows with invalid values), &quot;</span> <span class="o">+</span>
<span class="s2">&quot;&#39;error&#39; (throw an error), or &#39;keep&#39; (keep invalid values in a &quot;</span> <span class="o">+</span>
<span class="s2">&quot;special additional bucket). Note that in the multiple column &quot;</span> <span class="o">+</span>
<span class="s2">&quot;case, the invalid handling is applied to all columns. That said &quot;</span> <span class="o">+</span>
<span class="s2">&quot;for &#39;error&#39; it will throw an error if any invalids are found in &quot;</span> <span class="o">+</span>
<span class="s2">&quot;any column, for &#39;skip&#39; it will skip rows with any invalids in &quot;</span> <span class="o">+</span>
<span class="s2">&quot;any columns, etc.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">splitsArray</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;splitsArray&quot;</span><span class="p">,</span> <span class="s2">&quot;The array of split points for mapping &quot;</span> <span class="o">+</span>
<span class="s2">&quot;continuous features into buckets for multiple columns. For each input &quot;</span> <span class="o">+</span>
<span class="s2">&quot;column, with n+1 splits, there are n buckets. A bucket defined by &quot;</span> <span class="o">+</span>
<span class="s2">&quot;splits x,y holds values in the range [x,y) except the last bucket, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;which also includes y. The splits should be of length &gt;= 3 and &quot;</span> <span class="o">+</span>
<span class="s2">&quot;strictly increasing. Values at -inf, inf must be explicitly provided &quot;</span> <span class="o">+</span>
<span class="s2">&quot;to cover all Double values; otherwise, values outside the splits &quot;</span> <span class="o">+</span>
<span class="s2">&quot;specified will be treated as errors.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListListFloat</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">splits</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">splitsArray</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;, \</span>
<span class="sd"> splitsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Bucketizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Bucketizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Bucketizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">splits</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">splitsArray</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;, \</span>
<span class="sd"> splitsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> Sets params for this Bucketizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setSplits"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setSplits">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`splits`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">splits</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.getSplits"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.getSplits">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of threshold or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">splits</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setSplitsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setSplitsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSplitsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`splitsArray`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">splitsArray</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.getSplitsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.getSplitsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSplitsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the array of split points or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">splitsArray</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Bucketizer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_CountVectorizerParams</span><span class="p">(</span><span class="n">JavaParams</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`CountVectorizer` and :py:class:`CountVectorizerModel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">minTF</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;minTF&quot;</span><span class="p">,</span> <span class="s2">&quot;Filter to ignore rare words in&quot;</span> <span class="o">+</span>
<span class="s2">&quot; a document. For each document, terms with frequency/count less than the given&quot;</span> <span class="o">+</span>
<span class="s2">&quot; threshold are ignored. If this is an integer &gt;= 1, then this specifies a count (of&quot;</span> <span class="o">+</span>
<span class="s2">&quot; times the term must appear in the document); if this is a double in [0,1), then this &quot;</span> <span class="o">+</span>
<span class="s2">&quot;specifies a fraction (out of the document&#39;s token count). Note that the parameter is &quot;</span> <span class="o">+</span>
<span class="s2">&quot;only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">minDF</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;minDF&quot;</span><span class="p">,</span> <span class="s2">&quot;Specifies the minimum number of&quot;</span> <span class="o">+</span>
<span class="s2">&quot; different documents a term must appear in to be included in the vocabulary.&quot;</span> <span class="o">+</span>
<span class="s2">&quot; If this is an integer &gt;= 1, this specifies the number of documents the term must&quot;</span> <span class="o">+</span>
<span class="s2">&quot; appear in; if this is a double in [0,1), then this specifies the fraction of documents.&quot;</span> <span class="o">+</span>
<span class="s2">&quot; Default 1.0&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">maxDF</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;maxDF&quot;</span><span class="p">,</span> <span class="s2">&quot;Specifies the maximum number of&quot;</span> <span class="o">+</span>
<span class="s2">&quot; different documents a term could appear in to be included in the vocabulary.&quot;</span> <span class="o">+</span>
<span class="s2">&quot; A term that appears more than the threshold will be ignored. If this is an&quot;</span> <span class="o">+</span>
<span class="s2">&quot; integer &gt;= 1, this specifies the maximum number of documents the term could appear in;&quot;</span> <span class="o">+</span>
<span class="s2">&quot; if this is a double in [0,1), then this specifies the maximum&quot;</span> <span class="o">+</span>
<span class="s2">&quot; fraction of documents the term could appear in.&quot;</span> <span class="o">+</span>
<span class="s2">&quot; Default (2^63) - 1&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">vocabSize</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;vocabSize&quot;</span><span class="p">,</span> <span class="s2">&quot;max size of the vocabulary. Default 1 &lt;&lt; 18.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">binary</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;binary&quot;</span><span class="p">,</span> <span class="s2">&quot;Binary toggle to control the output vector values.&quot;</span> <span class="o">+</span>
<span class="s2">&quot; If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful&quot;</span> <span class="o">+</span>
<span class="s2">&quot; for discrete probabilistic models that model binary events rather than integer counts.&quot;</span> <span class="o">+</span>
<span class="s2">&quot; Default False&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_CountVectorizerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">maxDF</span><span class="o">=</span><span class="mi">2</span> <span class="o">**</span> <span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minTF or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minTF</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinDF</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minDF or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDF</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMaxDF</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of maxDF or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxDF</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of vocabSize or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocabSize</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of binary or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">binary</span><span class="p">)</span>
<div class="viewcode-block" id="CountVectorizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">CountVectorizer</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_CountVectorizerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(0, [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]), (1, [&quot;a&quot;, &quot;b&quot;, &quot;b&quot;, &quot;c&quot;, &quot;a&quot;])],</span>
<span class="sd"> ... [&quot;label&quot;, &quot;raw&quot;])</span>
<span class="sd"> &gt;&gt;&gt; cv = CountVectorizer()</span>
<span class="sd"> &gt;&gt;&gt; cv.setInputCol(&quot;raw&quot;)</span>
<span class="sd"> CountVectorizer...</span>
<span class="sd"> &gt;&gt;&gt; cv.setOutputCol(&quot;vectors&quot;)</span>
<span class="sd"> CountVectorizer...</span>
<span class="sd"> &gt;&gt;&gt; model = cv.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCol(&quot;raw&quot;)</span>
<span class="sd"> CountVectorizerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show(truncate=False)</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |label|raw |vectors |</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|</span>
<span class="sd"> |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; sorted(model.vocabulary) == [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;]</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; countVectorizerPath = temp_path + &quot;/count-vectorizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; cv.save(countVectorizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedCv = CountVectorizer.load(countVectorizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedCv.getMinDF() == cv.getMinDF()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedCv.getMinTF() == cv.getMinTF()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedCv.getVocabSize() == cv.getVocabSize()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/count-vectorizer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = CountVectorizerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.vocabulary == model.vocabulary</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; fromVocabModel = CountVectorizerModel.from_vocabulary([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],</span>
<span class="sd"> ... inputCol=&quot;raw&quot;, outputCol=&quot;vectors&quot;)</span>
<span class="sd"> &gt;&gt;&gt; fromVocabModel.transform(df).show(truncate=False)</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |label|raw |vectors |</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|</span>
<span class="sd"> |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|</span>
<span class="sd"> +-----+---------------+-------------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">minTF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">maxDF</span><span class="o">=</span><span class="mi">2</span> <span class="o">**</span> <span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 &lt;&lt; 18,\</span>
<span class="sd"> binary=False, inputCol=None,outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">CountVectorizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.CountVectorizer&quot;</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="CountVectorizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">minTF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">maxDF</span><span class="o">=</span><span class="mi">2</span> <span class="o">**</span> <span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span>
<span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 &lt;&lt; 18,\</span>
<span class="sd"> binary=False, inputCol=None, outputCol=None)</span>
<span class="sd"> Set the params for the CountVectorizer</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setMinTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMinTF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minTF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizer.setMinDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMinDF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minDF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Sphinx viewcode block: highlighted source of CountVectorizer.setMaxDF (setter for the maxDF param, @since 2.4.0). Generated markup — do not hand-edit. -->
<div class="viewcode-block" id="CountVectorizer.setMaxDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMaxDF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxDF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxDF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Sphinx viewcode block: highlighted source of CountVectorizer.setVocabSize (setter for the vocabSize param, @since 1.6.0). Generated markup — do not hand-edit. -->
<div class="viewcode-block" id="CountVectorizer.setVocabSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setVocabSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setVocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`vocabSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vocabSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Sphinx viewcode block: highlighted source of CountVectorizer.setBinary (setter for the binary param, @since 2.0.0). Generated markup — do not hand-edit. -->
<div class="viewcode-block" id="CountVectorizer.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`binary`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Sphinx viewcode block: highlighted source of CountVectorizer.setInputCol (setter for the inputCol param; no @since decorator in the source). Generated markup — do not hand-edit. -->
<div class="viewcode-block" id="CountVectorizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Sphinx viewcode block: highlighted source of CountVectorizer.setOutputCol (setter for the outputCol param). Generated markup — do not hand-edit. -->
<div class="viewcode-block" id="CountVectorizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Rendered source of the private CountVectorizer._create_model helper (wraps a java_model in CountVectorizerModel). It has no viewcode-block div of its own; the trailing </div> below closes an enclosing viewcode block opened earlier in the file — presumably the CountVectorizer class block; verify against the full page before restructuring. -->
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">CountVectorizerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- Sphinx viewcode block: full highlighted source of class CountVectorizerModel (model fitted by CountVectorizer, @inherit_doc, versionadded 1.6.0). Contains nested viewcode blocks for setInputCol/setOutputCol (3.0.0), the from_vocabulary classmethod (2.4.0), the vocabulary property (1.6.0), and setMinTF/setBinary (2.4.0). Each nested div id anchors a [docs] back-link from the API reference; keep the markup byte-identical to the Pygments/Sphinx generator output. -->
<div class="viewcode-block" id="CountVectorizerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">CountVectorizerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_CountVectorizerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`CountVectorizer`.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="CountVectorizerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizerModel.from_vocabulary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.from_vocabulary">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">from_vocabulary</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">vocabulary</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">minTF</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct the model directly from a vocabulary list of strings,</span>
<span class="sd"> requires an active SparkContext.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span>
<span class="n">jvocab</span> <span class="o">=</span> <span class="n">CountVectorizerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">vocabulary</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">CountVectorizerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.CountVectorizerModel&quot;</span><span class="p">,</span> <span class="n">jvocab</span><span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">setInputCol</span><span class="p">(</span><span class="n">inputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">outputCol</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setOutputCol</span><span class="p">(</span><span class="n">outputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">minTF</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setMinTF</span><span class="p">(</span><span class="n">minTF</span><span class="p">)</span>
<span class="k">if</span> <span class="n">binary</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setBinary</span><span class="p">(</span><span class="n">binary</span><span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vocabSize</span><span class="o">=</span><span class="nb">len</span><span class="p">(</span><span class="n">vocabulary</span><span class="p">))</span>
<span class="k">return</span> <span class="n">model</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">vocabulary</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> An array of terms in the vocabulary.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;vocabulary&quot;</span><span class="p">)</span>
<div class="viewcode-block" id="CountVectorizerModel.setMinTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setMinTF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minTF`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="CountVectorizerModel.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`binary`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- Sphinx viewcode block: full highlighted source of class DCT (1D discrete cosine transform feature transformer, versionadded 1.6.0). Includes the inverse Param declaration, __init__, and nested viewcode blocks for setParams, setInverse, getInverse, setInputCol, setOutputCol. The docstring text (including the doctest examples and the Wikipedia URL) reproduces the upstream Python source verbatim; do not alter it here. Generated markup — do not hand-edit. -->
<div class="viewcode-block" id="DCT"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">DCT</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that takes the 1D discrete cosine transform</span>
<span class="sd"> of a real vector. No zero padding is performed on the input vector.</span>
<span class="sd"> It returns a real vector of the same length representing the DCT.</span>
<span class="sd"> The return vector is scaled such that the transform matrix is</span>
<span class="sd"> unitary (aka scaled DCT-II).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> `More information on Wikipedia \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia&gt;`_.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], [&quot;vec&quot;])</span>
<span class="sd"> &gt;&gt;&gt; dct = DCT( )</span>
<span class="sd"> &gt;&gt;&gt; dct.setInverse(False)</span>
<span class="sd"> DCT...</span>
<span class="sd"> &gt;&gt;&gt; dct.setInputCol(&quot;vec&quot;)</span>
<span class="sd"> DCT...</span>
<span class="sd"> &gt;&gt;&gt; dct.setOutputCol(&quot;resultVec&quot;)</span>
<span class="sd"> DCT...</span>
<span class="sd"> &gt;&gt;&gt; df2 = dct.transform(df1)</span>
<span class="sd"> &gt;&gt;&gt; df2.head().resultVec</span>
<span class="sd"> DenseVector([10.969..., -0.707..., -2.041...])</span>
<span class="sd"> &gt;&gt;&gt; df3 = DCT(inverse=True, inputCol=&quot;resultVec&quot;, outputCol=&quot;origVec&quot;).transform(df2)</span>
<span class="sd"> &gt;&gt;&gt; df3.head().origVec</span>
<span class="sd"> DenseVector([5.0, 8.0, 6.0])</span>
<span class="sd"> &gt;&gt;&gt; dctPath = temp_path + &quot;/dct&quot;</span>
<span class="sd"> &gt;&gt;&gt; dct.save(dctPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedDtc = DCT.load(dctPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedDtc.transform(df1).take(1) == dct.transform(df1).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedDtc.getInverse()</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">inverse</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;inverse&quot;</span><span class="p">,</span> <span class="s2">&quot;Set transformer to perform inverse DCT, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;default False.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inverse</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inverse=False, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">DCT</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.DCT&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="DCT.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inverse</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inverse=False, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this DCT.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.setInverse"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setInverse">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInverse</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inverse`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.getInverse"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.getInverse">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getInverse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of inverse or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">inverse</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="DCT.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- Sphinx viewcode block: full highlighted source of class ElementwiseProduct (Hadamard product transformer, versionadded 1.5.0). Includes the scalingVec Param declaration, __init__, and nested viewcode blocks for setParams, setScalingVec, getScalingVec, setInputCol, setOutputCol. Generated markup — do not hand-edit. -->
<div class="viewcode-block" id="ElementwiseProduct"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">ElementwiseProduct</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Outputs the Hadamard product (i.e., the element-wise product) of each input vector</span>
<span class="sd"> with a provided &quot;weight&quot; vector. In other words, it scales each column of the dataset</span>
<span class="sd"> by a scalar multiplier.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; ep = ElementwiseProduct()</span>
<span class="sd"> &gt;&gt;&gt; ep.setScalingVec(Vectors.dense([1.0, 2.0, 3.0]))</span>
<span class="sd"> ElementwiseProduct...</span>
<span class="sd"> &gt;&gt;&gt; ep.setInputCol(&quot;values&quot;)</span>
<span class="sd"> ElementwiseProduct...</span>
<span class="sd"> &gt;&gt;&gt; ep.setOutputCol(&quot;eprod&quot;)</span>
<span class="sd"> ElementwiseProduct...</span>
<span class="sd"> &gt;&gt;&gt; ep.transform(df).head().eprod</span>
<span class="sd"> DenseVector([2.0, 2.0, 9.0])</span>
<span class="sd"> &gt;&gt;&gt; ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod</span>
<span class="sd"> DenseVector([4.0, 3.0, 15.0])</span>
<span class="sd"> &gt;&gt;&gt; elementwiseProductPath = temp_path + &quot;/elementwise-product&quot;</span>
<span class="sd"> &gt;&gt;&gt; ep.save(elementwiseProductPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedEp = ElementwiseProduct.load(elementwiseProductPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedEp.getScalingVec() == ep.getScalingVec()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedEp.transform(df).take(1) == ep.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">scalingVec</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;scalingVec&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector for hadamard product.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toVector</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">scalingVec</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, scalingVec=None, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">ElementwiseProduct</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.ElementwiseProduct&quot;</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="ElementwiseProduct.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">scalingVec</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, scalingVec=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this ElementwiseProduct.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.setScalingVec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setScalingVec">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`scalingVec`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">scalingVec</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.getScalingVec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.getScalingVec">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of scalingVec or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">scalingVec</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ElementwiseProduct.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="FeatureHasher"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">FeatureHasher</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasNumFeatures</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Feature hashing projects a set of categorical or numerical features into a feature vector of</span>
<span class="sd"> specified dimension (typically substantially smaller than that of the original feature</span>
<span class="sd"> space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)</span>
<span class="sd"> to map features to indices in the feature vector.</span>
<span class="sd"> The FeatureHasher transformer operates on multiple columns. Each column may contain either</span>
<span class="sd"> numeric or categorical features. Behavior and handling of column data types is as follows:</span>
<span class="sd"> * Numeric columns:</span>
<span class="sd"> For numeric features, the hash value of the column name is used to map the</span>
<span class="sd"> feature value to its index in the feature vector. By default, numeric features</span>
<span class="sd"> are not treated as categorical (even when they are integers). To treat them</span>
<span class="sd"> as categorical, specify the relevant columns in `categoricalCols`.</span>
<span class="sd"> * String columns:</span>
<span class="sd"> For categorical features, the hash value of the string &quot;column_name=value&quot;</span>
<span class="sd"> is used to map to the vector index, with an indicator value of `1.0`.</span>
<span class="sd"> Thus, categorical features are &quot;one-hot&quot; encoded</span>
<span class="sd"> (similarly to using :py:class:`OneHotEncoder` with `dropLast=false`).</span>
<span class="sd"> * Boolean columns:</span>
<span class="sd"> Boolean values are treated in the same way as string columns. That is,</span>
<span class="sd"> boolean features are represented as &quot;column_name=true&quot; or &quot;column_name=false&quot;,</span>
<span class="sd"> with an indicator value of `1.0`.</span>
<span class="sd"> Null (missing) values are ignored (implicitly zero in the resulting feature vector).</span>
<span class="sd"> Since a simple modulo is used to transform the hash function to a vector index,</span>
<span class="sd"> it is advisable to use a power of two as the `numFeatures` parameter;</span>
<span class="sd"> otherwise the features will not be mapped evenly to the vector indices.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(2.0, True, &quot;1&quot;, &quot;foo&quot;), (3.0, False, &quot;2&quot;, &quot;bar&quot;)]</span>
<span class="sd"> &gt;&gt;&gt; cols = [&quot;real&quot;, &quot;bool&quot;, &quot;stringNum&quot;, &quot;string&quot;]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, cols)</span>
<span class="sd"> &gt;&gt;&gt; hasher = FeatureHasher()</span>
<span class="sd"> &gt;&gt;&gt; hasher.setInputCols(cols)</span>
<span class="sd"> FeatureHasher...</span>
<span class="sd"> &gt;&gt;&gt; hasher.setOutputCol(&quot;features&quot;)</span>
<span class="sd"> FeatureHasher...</span>
<span class="sd"> &gt;&gt;&gt; hasher.transform(df).head().features</span>
<span class="sd"> SparseVector(262144, {174475: 2.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hasher.setCategoricalCols([&quot;real&quot;]).transform(df).head().features</span>
<span class="sd"> SparseVector(262144, {171257: 1.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hasherPath = temp_path + &quot;/hasher&quot;</span>
<span class="sd"> &gt;&gt;&gt; hasher.save(hasherPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHasher = FeatureHasher.load(hasherPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHasher.getNumFeatures() == hasher.getNumFeatures()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedHasher.transform(df).head().features == hasher.transform(df).head().features</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">categoricalCols</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;categoricalCols&quot;</span><span class="p">,</span>
<span class="s2">&quot;numeric columns to treat as categorical&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">categoricalCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numFeatures=1 &lt;&lt; 18, inputCols=None, outputCol=None, \</span>
<span class="sd"> categoricalCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">FeatureHasher</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.FeatureHasher&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="FeatureHasher.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">categoricalCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numFeatures=1 &lt;&lt; 18, inputCols=None, outputCol=None, \</span>
<span class="sd"> categoricalCols=None)</span>
<span class="sd"> Sets params for this FeatureHasher.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setCategoricalCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setCategoricalCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setCategoricalCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`categoricalCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">categoricalCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.getCategoricalCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.getCategoricalCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getCategoricalCols</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Gets the value of categoricalCols or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">categoricalCols</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setInputCols">[docs]</a> <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="FeatureHasher.setNumFeatures"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setNumFeatures">[docs]</a> <span class="k">def</span> <span class="nf">setNumFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numFeatures`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="HashingTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">HashingTF</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasNumFeatures</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Maps a sequence of terms to their term frequencies using the hashing trick.</span>
<span class="sd"> Currently we use Austin Appleby&#39;s MurmurHash 3 algorithm (MurmurHash3_x86_32)</span>
<span class="sd"> to calculate the hash code value for the term object.</span>
<span class="sd"> Since a simple modulo is used to transform the hash function to a column index,</span>
<span class="sd"> it is advisable to use a power of two as the numFeatures parameter;</span>
<span class="sd"> otherwise the features will not be mapped evenly to the columns.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],)], [&quot;words&quot;])</span>
<span class="sd"> &gt;&gt;&gt; hashingTF = HashingTF(inputCol=&quot;words&quot;, outputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.setNumFeatures(10)</span>
<span class="sd"> HashingTF...</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.transform(df).head().features</span>
<span class="sd"> SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.setParams(outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd"> SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; params = {hashingTF.numFeatures: 5, hashingTF.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.transform(df, params).head().vector</span>
<span class="sd"> SparseVector(5, {0: 1.0, 2: 1.0, 3: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; hashingTFPath = temp_path + &quot;/hashing-tf&quot;</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.save(hashingTFPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHashingTF = HashingTF.load(hashingTFPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedHashingTF.transform(df).take(1) == hashingTF.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; hashingTF.indexOf(&quot;b&quot;)</span>
<span class="sd"> 5</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">binary</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;binary&quot;</span><span class="p">,</span> <span class="s2">&quot;If True, all non zero counts are set to 1. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;This is useful for discrete probabilistic models that model binary events &quot;</span> <span class="o">+</span>
<span class="s2">&quot;rather than integer counts. Default False.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numFeatures=1 &lt;&lt; 18, binary=False, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">HashingTF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.HashingTF&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="HashingTF.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numFeatures=1 &lt;&lt; 18, binary=False, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this HashingTF.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`binary`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.getBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.getBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of binary or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">binary</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.setNumFeatures"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setNumFeatures">[docs]</a> <span class="k">def</span> <span class="nf">setNumFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numFeatures`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="HashingTF.indexOf"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.indexOf">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">indexOf</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">term</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the index of the input term.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_transfer_params_to_java</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">indexOf</span><span class="p">(</span><span class="n">term</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_IDFParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`IDF` and :py:class:`IDFModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">minDocFreq</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;minDocFreq&quot;</span><span class="p">,</span>
<span class="s2">&quot;minimum number of documents in which a term should appear for filtering&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minDocFreq or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDocFreq</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_IDFParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<div class="viewcode-block" id="IDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">IDF</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_IDFParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute the Inverse Document Frequency (IDF) given a collection of documents.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import DenseVector</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(DenseVector([1.0, 2.0]),),</span>
<span class="sd"> ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], [&quot;tf&quot;])</span>
<span class="sd"> &gt;&gt;&gt; idf = IDF(minDocFreq=3)</span>
<span class="sd"> &gt;&gt;&gt; idf.setInputCol(&quot;tf&quot;)</span>
<span class="sd"> IDF...</span>
<span class="sd"> &gt;&gt;&gt; idf.setOutputCol(&quot;idf&quot;)</span>
<span class="sd"> IDF...</span>
<span class="sd"> &gt;&gt;&gt; model = idf.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;idf&quot;)</span>
<span class="sd"> IDFModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getMinDocFreq()</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; model.idf</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.docFreq</span>
<span class="sd"> [0, 3]</span>
<span class="sd"> &gt;&gt;&gt; model.numDocs == df.count()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().idf</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; idf.setParams(outputCol=&quot;freqs&quot;).fit(df).transform(df).collect()[1].freqs</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; params = {idf.minDocFreq: 1, idf.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; idf.fit(df, params).transform(df).head().vector</span>
<span class="sd"> DenseVector([0.2877, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; idfPath = temp_path + &quot;/idf&quot;</span>
<span class="sd"> &gt;&gt;&gt; idf.save(idfPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIdf = IDF.load(idfPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIdf.getMinDocFreq() == idf.getMinDocFreq()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/idf-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = IDFModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).head().idf == model.transform(df).head().idf</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">IDF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.IDF&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="IDF.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this IDF.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDF.setMinDocFreq"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setMinDocFreq">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minDocFreq`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDF.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDF.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">IDFModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDFModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel">[docs]</a><span class="k">class</span> <span class="nc">IDFModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_IDFParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`IDF`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="IDFModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IDFModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">idf</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the IDF vector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;idf&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">docFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the document frequency.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;docFreq&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">numDocs</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of documents evaluated to compute the idf.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;numDocs&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_ImputerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`Imputer` and :py:class:`ImputerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">strategy</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;strategy&quot;</span><span class="p">,</span>
<span class="s2">&quot;strategy for imputation. If mean, then replace missing values using the mean &quot;</span>
<span class="s2">&quot;value of the feature. If median, then replace missing values using the &quot;</span>
<span class="s2">&quot;median value of the feature. If mode, then replace missing values using the most &quot;</span>
<span class="s2">&quot;frequent value of the feature.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">missingValue</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;missingValue&quot;</span><span class="p">,</span>
<span class="s2">&quot;The placeholder for the missing values. All occurrences of missingValue &quot;</span>
<span class="s2">&quot;will be imputed.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_ImputerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">strategy</span><span class="o">=</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="n">missingValue</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStrategy</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`strategy` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">strategy</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMissingValue</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`missingValue` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">missingValue</span><span class="p">)</span>
<div class="viewcode-block" id="Imputer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Imputer</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_ImputerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Imputation estimator for completing missing values, using the mean, median or mode</span>
<span class="sd"> of the columns in which the missing values are located. The input columns should be of</span>
<span class="sd"> numeric type. Currently Imputer does not support categorical features and</span>
<span class="sd"> possibly creates incorrect values for a categorical feature.</span>
<span class="sd"> Note that the mean/median/mode value is computed after filtering out missing values.</span>
<span class="sd"> All Null values in the input columns are treated as missing, and so are also imputed. For</span>
<span class="sd"> computing median, :py:meth:`pyspark.sql.DataFrame.approxQuantile` is used with a</span>
<span class="sd"> relative error of `0.001`.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1.0, float(&quot;nan&quot;)), (2.0, float(&quot;nan&quot;)), (float(&quot;nan&quot;), 3.0),</span>
<span class="sd"> ... (4.0, 4.0), (5.0, 5.0)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; imputer = Imputer()</span>
<span class="sd"> &gt;&gt;&gt; imputer.setInputCols([&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> Imputer...</span>
<span class="sd"> &gt;&gt;&gt; imputer.setOutputCols([&quot;out_a&quot;, &quot;out_b&quot;])</span>
<span class="sd"> Imputer...</span>
<span class="sd"> &gt;&gt;&gt; imputer.getRelativeError()</span>
<span class="sd"> 0.001</span>
<span class="sd"> &gt;&gt;&gt; model = imputer.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCols([&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> ImputerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getStrategy()</span>
<span class="sd"> &#39;mean&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.surrogateDF.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> |3.0|4.0|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> | a| b|out_a|out_b|</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> |1.0|NaN| 1.0| 4.0|</span>
<span class="sd"> |2.0|NaN| 2.0| 4.0|</span>
<span class="sd"> |NaN|3.0| 3.0| 3.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputer.setStrategy(&quot;median&quot;).setMissingValue(1.0).fit(df).transform(df).show()</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> | a| b|out_a|out_b|</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> |1.0|NaN| 4.0| NaN|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1.0,), (2.0,), (float(&quot;nan&quot;),), (4.0,), (5.0,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; imputer1 = Imputer(inputCol=&quot;a&quot;, outputCol=&quot;out_a&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model1 = imputer1.fit(df1)</span>
<span class="sd"> &gt;&gt;&gt; model1.surrogateDF.show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | a|</span>
<span class="sd"> +---+</span>
<span class="sd"> |3.0|</span>
<span class="sd"> +---+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model1.transform(df1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | a|out_a|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |1.0| 1.0|</span>
<span class="sd"> |2.0| 2.0|</span>
<span class="sd"> |NaN| 3.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputer1.setStrategy(&quot;median&quot;).setMissingValue(1.0).fit(df1).transform(df1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | a|out_a|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |1.0| 4.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(float(&quot;nan&quot;),), (float(&quot;nan&quot;),), (3.0,), (4.0,), (5.0,)],</span>
<span class="sd"> ... [&quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; imputer2 = Imputer(inputCol=&quot;b&quot;, outputCol=&quot;out_b&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model2 = imputer2.fit(df2)</span>
<span class="sd"> &gt;&gt;&gt; model2.surrogateDF.show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | b|</span>
<span class="sd"> +---+</span>
<span class="sd"> |4.0|</span>
<span class="sd"> +---+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model2.transform(df2).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | b|out_b|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |NaN| 4.0|</span>
<span class="sd"> |NaN| 4.0|</span>
<span class="sd"> |3.0| 3.0|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputer2.setStrategy(&quot;median&quot;).setMissingValue(1.0).fit(df2).transform(df2).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | b|out_b|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |NaN| NaN|</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; imputerPath = temp_path + &quot;/imputer&quot;</span>
<span class="sd"> &gt;&gt;&gt; imputer.save(imputerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedImputer = Imputer.load(imputerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedImputer.getStrategy() == imputer.getStrategy()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedImputer.getMissingValue()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/imputer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = ImputerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).head().out_a == model.transform(df).head().out_a</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">strategy</span><span class="o">=</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="n">missingValue</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, strategy=&quot;mean&quot;, missingValue=float(&quot;nan&quot;), inputCols=None, \</span>
<span class="sd"> outputCols=None, inputCol=None, outputCol=None, relativeError=0.001):</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Imputer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Imputer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Imputer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">strategy</span><span class="o">=</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="n">missingValue</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;nan&quot;</span><span class="p">),</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, strategy=&quot;mean&quot;, missingValue=float(&quot;nan&quot;), inputCols=None, \</span>
<span class="sd"> outputCols=None, inputCol=None, outputCol=None, relativeError=0.001)</span>
<span class="sd"> Sets params for this Imputer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setStrategy"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setStrategy">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStrategy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`strategy`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">strategy</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setMissingValue"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setMissingValue">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMissingValue</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`missingValue`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">missingValue</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Imputer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Sphinx viewcode anchor for Imputer.setOutputCol (generated highlighted listing; do not hand-edit the span content). --><div class="viewcode-block" id="Imputer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div><!-- end viewcode-block Imputer.setOutputCol -->
<!-- Sphinx viewcode anchor for Imputer.setRelativeError (generated highlighted listing; do not hand-edit the span content). --><div class="viewcode-block" id="Imputer.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`relativeError`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div><!-- end viewcode-block Imputer.setRelativeError -->
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">ImputerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- Sphinx viewcode listing for class ImputerModel, with nested viewcode anchors for each public setter. Generated markup (Pygments span classes: k=keyword, nc=class name, sd=docstring, bp=builtin pseudo, n=name, p=punctuation, o=operator); fix the Python source, not this HTML. --><div class="viewcode-block" id="ImputerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel">[docs]</a><span class="k">class</span> <span class="nc">ImputerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_ImputerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`Imputer`.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="ImputerModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ImputerModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ImputerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="ImputerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- surrogateDF below is a property and carries no nested viewcode anchor of its own in this listing. --><span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">surrogateDF</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a DataFrame containing inputCols and their corresponding surrogates,</span>
<span class="sd"> which are used to replace the missing values in the input DataFrame.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;surrogateDF&quot;</span><span class="p">)</span></div><!-- closes the ImputerModel viewcode-block -->
<!-- Sphinx viewcode listing for class Interaction (JavaTransformer). Generated markup; the sd spans carry the Python docstring, including the doctest Examples section. Edit the pyspark source, not this HTML. --><div class="viewcode-block" id="Interaction"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Interaction</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Implements the feature interaction transform. This transformer takes in Double and Vector type</span>
<span class="sd"> columns and outputs a flattened vector of their feature interactions. To handle interaction,</span>
<span class="sd"> we first one-hot encode any nominal features. Then, a vector of the feature cross-products is</span>
<span class="sd"> produced.</span>
<span class="sd"> For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be</span>
<span class="sd"> `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal</span>
<span class="sd"> with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; interaction = Interaction()</span>
<span class="sd"> &gt;&gt;&gt; interaction.setInputCols([&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> Interaction...</span>
<span class="sd"> &gt;&gt;&gt; interaction.setOutputCol(&quot;ab&quot;)</span>
<span class="sd"> Interaction...</span>
<span class="sd"> &gt;&gt;&gt; interaction.transform(df).show()</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> | a| b| ab|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> |0.0|1.0|[0.0]|</span>
<span class="sd"> |2.0|3.0|[6.0]|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; interactionPath = temp_path + &quot;/interaction&quot;</span>
<span class="sd"> &gt;&gt;&gt; interaction.save(interactionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInteraction = Interaction.load(interactionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCols=None, outputCol=None):</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Interaction</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Interaction&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">()</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Interaction.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCols=None, outputCol=None)</span>
<span class="sd"> Sets params for this Interaction.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Interaction.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Interaction.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div><!-- inner div closes Interaction.setOutputCol; outer div closes the Interaction viewcode-block -->
<!-- Listing of class _MaxAbsScalerParams: rendered without a viewcode-block div or [docs] anchor (no back link to an API page for this underscore-prefixed class). --><span class="k">class</span> <span class="nc">_MaxAbsScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`MaxAbsScaler` and :py:class:`MaxAbsScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">pass</span>
<!-- Sphinx viewcode listing for class MaxAbsScaler (JavaEstimator) with nested anchors for setParams/setInputCol/setOutputCol. Generated markup; edit the pyspark source, not this HTML. --><div class="viewcode-block" id="MaxAbsScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">MaxAbsScaler</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_MaxAbsScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Rescale each feature individually to range [-1, 1] by dividing through the largest maximum</span>
<span class="sd"> absolute value in each feature. It does not shift/center the data, and thus does not destroy</span>
<span class="sd"> any sparsity.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; maScaler = MaxAbsScaler(outputCol=&quot;scaled&quot;)</span>
<span class="sd"> &gt;&gt;&gt; maScaler.setInputCol(&quot;a&quot;)</span>
<span class="sd"> MaxAbsScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = maScaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;scaledOutput&quot;)</span>
<span class="sd"> MaxAbsScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> | a|scaledOutput|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> |[1.0]| [0.5]|</span>
<span class="sd"> |[2.0]| [1.0]|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; scalerPath = temp_path + &quot;/max-abs-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; maScaler.save(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMAScaler = MaxAbsScaler.load(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMAScaler.getInputCol() == maScaler.getInputCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedMAScaler.getOutputCol() == maScaler.getOutputCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/max-abs-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = MaxAbsScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.maxAbs == model.maxAbs</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">MaxAbsScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.MaxAbsScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">()</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="MaxAbsScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this MaxAbsScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<!-- NOTE(review): unlike the Imputer setters above, these two setters carry no @since decorator in the rendered source. --><div class="viewcode-block" id="MaxAbsScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MaxAbsScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">MaxAbsScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div><!-- closes the MaxAbsScaler viewcode-block -->
<!-- Sphinx viewcode listing for class MaxAbsScalerModel with nested anchors for its setters; the maxAbs property has no anchor of its own. Generated markup; edit the pyspark source, not this HTML. --><div class="viewcode-block" id="MaxAbsScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel">[docs]</a><span class="k">class</span> <span class="nc">MaxAbsScalerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_MaxAbsScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`MaxAbsScaler`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="MaxAbsScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MaxAbsScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">maxAbs</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Max Abs vector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;maxAbs&quot;</span><span class="p">)</span></div><!-- closes the MaxAbsScalerModel viewcode-block -->
<div class="viewcode-block" id="MinHashLSH"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">MinHashLSH</span><span class="p">(</span><span class="n">_LSH</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasSeed</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> LSH class for Jaccard distance.</span>
<span class="sd"> The input can be dense or sparse vectors, but it is more efficient if it is sparse.</span>
<span class="sd"> For example, `Vectors.sparse(10, [(2, 1.0), (3, 1.0), (5, 1.0)])` means there are 10 elements</span>
<span class="sd"> in the space. This set contains elements 2, 3, and 5. Also, any input vector must have at</span>
<span class="sd"> least 1 non-zero index, and all non-zero values are treated as binary &quot;1&quot; values.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> See `Wikipedia on MinHash &lt;https://en.wikipedia.org/wiki/MinHash&gt;`_</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col</span>
<span class="sd"> &gt;&gt;&gt; data = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; mh = MinHashLSH()</span>
<span class="sd"> &gt;&gt;&gt; mh.setInputCol(&quot;features&quot;)</span>
<span class="sd"> MinHashLSH...</span>
<span class="sd"> &gt;&gt;&gt; mh.setOutputCol(&quot;hashes&quot;)</span>
<span class="sd"> MinHashLSH...</span>
<span class="sd"> &gt;&gt;&gt; mh.setSeed(12345)</span>
<span class="sd"> MinHashLSH...</span>
<span class="sd"> &gt;&gt;&gt; model = mh.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCol(&quot;features&quot;)</span>
<span class="sd"> MinHashLSHModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head()</span>
<span class="sd"> Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668...</span>
<span class="sd"> &gt;&gt;&gt; data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),</span>
<span class="sd"> ... (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(data2, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; key = Vectors.sparse(6, [1, 2], [1.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; model.approxNearestNeighbors(df2, key, 1).collect()</span>
<span class="sd"> [Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668...</span>
<span class="sd"> &gt;&gt;&gt; model.approxSimilarityJoin(df, df2, 0.6, distCol=&quot;JaccardDistance&quot;).select(</span>
<span class="sd"> ... col(&quot;datasetA.id&quot;).alias(&quot;idA&quot;),</span>
<span class="sd"> ... col(&quot;datasetB.id&quot;).alias(&quot;idB&quot;),</span>
<span class="sd"> ... col(&quot;JaccardDistance&quot;)).show()</span>
<span class="sd"> +---+---+---------------+</span>
<span class="sd"> |idA|idB|JaccardDistance|</span>
<span class="sd"> +---+---+---------------+</span>
<span class="sd"> | 0| 5| 0.5|</span>
<span class="sd"> | 1| 4| 0.5|</span>
<span class="sd"> +---+---+---------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; mhPath = temp_path + &quot;/mh&quot;</span>
<span class="sd"> &gt;&gt;&gt; mh.save(mhPath)</span>
<span class="sd"> &gt;&gt;&gt; mh2 = MinHashLSH.load(mhPath)</span>
<span class="sd"> &gt;&gt;&gt; mh2.getOutputCol() == mh.getOutputCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/mh-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model2 = MinHashLSHModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().hashes == model2.transform(df).head().hashes</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">numHashTables</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">MinHashLSH</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.MinHashLSH&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="MinHashLSH.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">numHashTables</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)</span>
<span class="sd"> Sets params for this MinHashLSH.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinHashLSH.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">MinHashLSHModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinHashLSHModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSHModel.html#pyspark.ml.feature.MinHashLSHModel">[docs]</a><span class="k">class</span> <span class="nc">MinHashLSHModel</span><span class="p">(</span><span class="n">_LSHModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model produced by :py:class:`MinHashLSH`, where multiple hash functions are stored. Each</span>
<span class="sd"> hash function is picked from the following family of hash functions, where :math:`a_i` and</span>
<span class="sd"> :math:`b_i` are randomly chosen integers less than prime:</span>
<span class="sd"> :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise</span>
<span class="sd"> independent according to the reference.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> See Tom Bohman, Colin Cooper, and Alan Frieze. &quot;Min-wise independent linear permutations.&quot;</span>
<span class="sd"> Electronic Journal of Combinatorics 7 (2000): R26.</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<span class="k">class</span> <span class="nc">_MinMaxScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`MinMaxScaler` and :py:class:`MinMaxScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">min</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;min&quot;</span><span class="p">,</span> <span class="s2">&quot;Lower bound of the output feature range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="nb">max</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;max&quot;</span><span class="p">,</span> <span class="s2">&quot;Upper bound of the output feature range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_MinMaxScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="nb">max</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMin</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of min or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">min</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMax</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of max or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">max</span><span class="p">)</span>
<div class="viewcode-block" id="MinMaxScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">MinMaxScaler</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_MinMaxScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Rescale each feature individually to a common range [min, max] linearly using column summary</span>
<span class="sd"> statistics, which is also known as min-max normalization or Rescaling. The rescaled value for</span>
<span class="sd"> feature E is calculated as,</span>
<span class="sd"> Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min</span>
<span class="sd"> For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Since zero values will probably be transformed to non-zero values, output of the</span>
<span class="sd"> transformer will be DenseVector even for sparse input.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; mmScaler = MinMaxScaler(outputCol=&quot;scaled&quot;)</span>
<span class="sd"> &gt;&gt;&gt; mmScaler.setInputCol(&quot;a&quot;)</span>
<span class="sd"> MinMaxScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = mmScaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;scaledOutput&quot;)</span>
<span class="sd"> MinMaxScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.originalMin</span>
<span class="sd"> DenseVector([0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.originalMax</span>
<span class="sd"> DenseVector([2.0])</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> | a|scaledOutput|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> |[0.0]| [0.0]|</span>
<span class="sd"> |[2.0]| [1.0]|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; minMaxScalerPath = temp_path + &quot;/min-max-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; mmScaler.save(minMaxScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMMScaler = MinMaxScaler.load(minMaxScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedMMScaler.getMin() == mmScaler.getMin()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedMMScaler.getMax() == mmScaler.getMax()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/min-max-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = MinMaxScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.originalMin == model.originalMin</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.originalMax == model.originalMax</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="nb">min</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="nb">max</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">MinMaxScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.MinMaxScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="MinMaxScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="nb">min</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="nb">max</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this MinMaxScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScaler.setMin"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setMin">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`min`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScaler.setMax"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setMax">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`max`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">max</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">MinMaxScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel">[docs]</a><span class="k">class</span> <span class="nc">MinMaxScalerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_MinMaxScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`MinMaxScaler`.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="MinMaxScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScalerModel.setMin"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setMin">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`min`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="MinMaxScalerModel.setMax"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setMax">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`max`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">max</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">originalMin</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Min value for each original column during fitting.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;originalMin&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">originalMax</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Max value for each original column during fitting.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;originalMax&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">NGram</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that converts the input array of strings into an array of n-grams. Null</span>
<span class="sd"> values in the input array are ignored.</span>
<span class="sd"> It returns an array of n-grams where each n-gram is represented by a space-separated string of</span>
<span class="sd"> words.</span>
<span class="sd"> When the input is empty, an empty array is returned.</span>
<span class="sd"> When the input array length is less than n (number of elements per n-gram), no n-grams are</span>
<span class="sd"> returned.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(inputTokens=[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;, &quot;d&quot;, &quot;e&quot;])])</span>
<span class="sd"> &gt;&gt;&gt; ngram = NGram(n=2)</span>
<span class="sd"> &gt;&gt;&gt; ngram.setInputCol(&quot;inputTokens&quot;)</span>
<span class="sd"> NGram...</span>
<span class="sd"> &gt;&gt;&gt; ngram.setOutputCol(&quot;nGrams&quot;)</span>
<span class="sd"> NGram...</span>
<span class="sd"> &gt;&gt;&gt; ngram.transform(df).head()</span>
<span class="sd"> Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b&#39;, &#39;b c&#39;, &#39;c d&#39;, &#39;d e&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Change n-gram length</span>
<span class="sd"> &gt;&gt;&gt; ngram.setParams(n=4).transform(df).head()</span>
<span class="sd"> Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Temporarily modify output column.</span>
<span class="sd"> &gt;&gt;&gt; ngram.transform(df, {ngram.outputCol: &quot;output&quot;}).head()</span>
<span class="sd"> Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], output=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ngram.transform(df).head()</span>
<span class="sd"> Row(inputTokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;], nGrams=[&#39;a b c d&#39;, &#39;b c d e&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Must use keyword arguments to specify params.</span>
<span class="sd"> &gt;&gt;&gt; ngram.setParams(&quot;text&quot;)</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> &gt;&gt;&gt; ngramPath = temp_path + &quot;/ngram&quot;</span>
<span class="sd"> &gt;&gt;&gt; ngram.save(ngramPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedNGram = NGram.load(ngramPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedNGram.getN() == ngram.getN()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedNGram.transform(df).take(1) == ngram.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">n</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;n&quot;</span><span class="p">,</span> <span class="s2">&quot;number of elements per n-gram (&gt;=1)&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, n=2, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">NGram</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.NGram&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="NGram.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, n=2, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this NGram.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.setN"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setN">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setN</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`n`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.getN"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.getN">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getN</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of n or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="NGram.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="Normalizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Normalizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Normalize a vector to have unit norm using the given p-norm.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], [&quot;dense&quot;, &quot;sparse&quot;])</span>
<span class="sd"> &gt;&gt;&gt; normalizer = Normalizer(p=2.0)</span>
<span class="sd"> &gt;&gt;&gt; normalizer.setInputCol(&quot;dense&quot;)</span>
<span class="sd"> Normalizer...</span>
<span class="sd"> &gt;&gt;&gt; normalizer.setOutputCol(&quot;features&quot;)</span>
<span class="sd"> Normalizer...</span>
<span class="sd"> &gt;&gt;&gt; normalizer.transform(df).head().features</span>
<span class="sd"> DenseVector([0.6, -0.8])</span>
<span class="sd"> &gt;&gt;&gt; normalizer.setParams(inputCol=&quot;sparse&quot;, outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd"> SparseVector(4, {1: 0.8, 3: 0.6})</span>
<span class="sd"> &gt;&gt;&gt; params = {normalizer.p: 1.0, normalizer.inputCol: &quot;dense&quot;, normalizer.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; normalizer.transform(df, params).head().vector</span>
<span class="sd"> DenseVector([0.4286, -0.5714])</span>
<span class="sd"> &gt;&gt;&gt; normalizerPath = temp_path + &quot;/normalizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; normalizer.save(normalizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedNormalizer = Normalizer.load(normalizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedNormalizer.getP() == normalizer.getP()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedNormalizer.transform(df).take(1) == normalizer.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">p</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;p&quot;</span><span class="p">,</span> <span class="s2">&quot;the p norm value.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, p=2.0, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Normalizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Normalizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Normalizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, p=2.0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Normalizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.setP"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setP">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setP</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`p`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.getP"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.getP">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getP</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of p or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">p</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Normalizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_OneHotEncoderParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`OneHotEncoder` and :py:class:`OneHotEncoderModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span> <span class="s2">&quot;How to handle invalid data during &quot;</span> <span class="o">+</span>
<span class="s2">&quot;transform(). Options are &#39;keep&#39; (invalid data presented as an extra &quot;</span> <span class="o">+</span>
<span class="s2">&quot;categorical feature) or error (throw an error). Note that this Param &quot;</span> <span class="o">+</span>
<span class="s2">&quot;is only used during transform; during fitting, invalid data will &quot;</span> <span class="o">+</span>
<span class="s2">&quot;result in an error.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">dropLast</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;dropLast&quot;</span><span class="p">,</span> <span class="s2">&quot;whether to drop the last category&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_OneHotEncoderParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">dropLast</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of dropLast or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dropLast</span><span class="p">)</span>
<div class="viewcode-block" id="OneHotEncoder"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">OneHotEncoder</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_OneHotEncoderParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A one-hot encoder that maps a column of category indices to a column of binary vectors, with</span>
<span class="sd"> at most a single one-value per row that indicates the input category index.</span>
<span class="sd"> For example with 5 categories, an input value of 2.0 would map to an output vector of</span>
<span class="sd"> `[0.0, 0.0, 1.0, 0.0]`.</span>
<span class="sd"> The last category is not included by default (configurable via :py:attr:`dropLast`),</span>
<span class="sd"> because it makes the vector entries sum up to one, and hence linearly dependent.</span>
<span class="sd"> So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.</span>
<span class="sd"> When :py:attr:`handleInvalid` is configured to &#39;keep&#39;, an extra &quot;category&quot; indicating invalid</span>
<span class="sd"> values is added as last category. So when :py:attr:`dropLast` is true, invalid values are</span>
<span class="sd"> encoded as all-zeros vector.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is different from scikit-learn&#39;s OneHotEncoder, which keeps all categories.</span>
<span class="sd"> The output vectors are sparse.</span>
<span class="sd"> When encoding multi-column by using :py:attr:`inputCols` and</span>
<span class="sd"> :py:attr:`outputCols` params, input/output cols come in pairs, specified by the order in</span>
<span class="sd"> the arrays, and each pair is treated independently.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> StringIndexer : for converting categorical values into category indices</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], [&quot;input&quot;])</span>
<span class="sd"> &gt;&gt;&gt; ohe = OneHotEncoder()</span>
<span class="sd"> &gt;&gt;&gt; ohe.setInputCols([&quot;input&quot;])</span>
<span class="sd"> OneHotEncoder...</span>
<span class="sd"> &gt;&gt;&gt; ohe.setOutputCols([&quot;output&quot;])</span>
<span class="sd"> OneHotEncoder...</span>
<span class="sd"> &gt;&gt;&gt; model = ohe.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCols([&quot;output&quot;])</span>
<span class="sd"> OneHotEncoderModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getHandleInvalid()</span>
<span class="sd"> &#39;error&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().output</span>
<span class="sd"> SparseVector(2, {0: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; single_col_ohe = OneHotEncoder(inputCol=&quot;input&quot;, outputCol=&quot;output&quot;)</span>
<span class="sd"> &gt;&gt;&gt; single_col_model = single_col_ohe.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; single_col_model.transform(df).head().output</span>
<span class="sd"> SparseVector(2, {0: 1.0})</span>
<span class="sd"> &gt;&gt;&gt; ohePath = temp_path + &quot;/ohe&quot;</span>
<span class="sd"> &gt;&gt;&gt; ohe.save(ohePath)</span>
<span class="sd"> &gt;&gt;&gt; loadedOHE = OneHotEncoder.load(ohePath)</span>
<span class="sd"> &gt;&gt;&gt; loadedOHE.getInputCols() == ohe.getInputCols()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/ohe-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = OneHotEncoderModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.categorySizes == model.categorySizes</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">dropLast</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCols=None, outputCols=None, handleInvalid=&quot;error&quot;, dropLast=True, \</span>
<span class="sd"> inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">OneHotEncoder</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.OneHotEncoder&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="OneHotEncoder.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="n">dropLast</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCols=None, outputCols=None, handleInvalid=&quot;error&quot;, \</span>
<span class="sd"> dropLast=True, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this OneHotEncoder.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setDropLast"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setDropLast">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`dropLast`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoder.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">OneHotEncoderModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel">[docs]</a><span class="k">class</span> <span class="nc">OneHotEncoderModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_OneHotEncoderParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`OneHotEncoder`.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="OneHotEncoderModel.setDropLast"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setDropLast">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`dropLast`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="OneHotEncoderModel.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">categorySizes</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Original number of categories for each feature being encoded.</span>
<span class="sd"> The array contains one value for each input column, in order.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;categorySizes&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PolynomialExpansion</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Perform feature expansion in a polynomial space. As said in `wikipedia of Polynomial Expansion</span>
<span class="sd"> &lt;http://en.wikipedia.org/wiki/Polynomial_expansion&gt;`_, &quot;In mathematics, an</span>
<span class="sd"> expansion of a product of sums expresses it as a sum of products by using the fact that</span>
<span class="sd"> multiplication distributes over addition&quot;. Take a 2-variable feature vector as an example:</span>
<span class="sd"> `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], [&quot;dense&quot;])</span>
<span class="sd"> &gt;&gt;&gt; px = PolynomialExpansion(degree=2)</span>
<span class="sd"> &gt;&gt;&gt; px.setInputCol(&quot;dense&quot;)</span>
<span class="sd"> PolynomialExpansion...</span>
<span class="sd"> &gt;&gt;&gt; px.setOutputCol(&quot;expanded&quot;)</span>
<span class="sd"> PolynomialExpansion...</span>
<span class="sd"> &gt;&gt;&gt; px.transform(df).head().expanded</span>
<span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="sd"> &gt;&gt;&gt; px.setParams(outputCol=&quot;test&quot;).transform(df).head().test</span>
<span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="sd"> &gt;&gt;&gt; polyExpansionPath = temp_path + &quot;/poly-expansion&quot;</span>
<span class="sd"> &gt;&gt;&gt; px.save(polyExpansionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPx = PolynomialExpansion.load(polyExpansionPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPx.getDegree() == px.getDegree()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedPx.transform(df).take(1) == px.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">degree</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;degree&quot;</span><span class="p">,</span> <span class="s2">&quot;the polynomial degree to expand (&gt;= 1)&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, degree=2, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PolynomialExpansion</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.PolynomialExpansion&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="PolynomialExpansion.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, degree=2, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this PolynomialExpansion.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.setDegree"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setDegree">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`degree`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.getDegree"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.getDegree">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of degree or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">degree</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PolynomialExpansion.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="QuantileDiscretizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">QuantileDiscretizer</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :py:class:`QuantileDiscretizer` takes a column with continuous features and outputs a column</span>
<span class="sd"> with binned categorical features. The number of bins can be set using the :py:attr:`numBuckets`</span>
<span class="sd"> parameter. It is possible that the number of buckets used will be less than this value, for</span>
<span class="sd"> example, if there are too few distinct values of the input to create enough distinct quantiles.</span>
<span class="sd"> Since 3.0.0, :py:class:`QuantileDiscretizer` can map multiple columns at once by setting the</span>
<span class="sd"> :py:attr:`inputCols` parameter. If both of the :py:attr:`inputCol` and :py:attr:`inputCols`</span>
<span class="sd"> parameters are set, an Exception will be thrown. To specify the number of buckets for each</span>
<span class="sd"> column, the :py:attr:`numBucketsArray` parameter can be set, or if the number of buckets</span>
<span class="sd"> should be the same across columns, :py:attr:`numBuckets` can be set as a convenience.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> NaN handling: Note also that</span>
<span class="sd"> :py:class:`QuantileDiscretizer` will raise an error when it finds NaN values in the dataset,</span>
<span class="sd"> but the user can also choose to either keep or remove NaN values within the dataset by setting</span>
<span class="sd"> :py:attr:`handleInvalid` parameter. If the user chooses to keep NaN values, they will be</span>
<span class="sd"> handled specially and placed into their own bucket, for example, if 4 buckets are used, then</span>
<span class="sd"> non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4].</span>
<span class="sd"> Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for</span>
<span class="sd"> :py:meth:`~.DataFrameStatFunctions.approxQuantile` for a detailed description).</span>
<span class="sd"> The precision of the approximation can be controlled with the</span>
<span class="sd"> :py:attr:`relativeError` parameter.</span>
<span class="sd"> The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(&quot;nan&quot;),), (float(&quot;nan&quot;),)]</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame(values, [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds1 = QuantileDiscretizer(inputCol=&quot;values&quot;, outputCol=&quot;buckets&quot;)</span>
<span class="sd"> &gt;&gt;&gt; qds1.setNumBuckets(2)</span>
<span class="sd"> QuantileDiscretizer...</span>
<span class="sd"> &gt;&gt;&gt; qds1.setRelativeError(0.01)</span>
<span class="sd"> QuantileDiscretizer...</span>
<span class="sd"> &gt;&gt;&gt; qds1.setHandleInvalid(&quot;error&quot;)</span>
<span class="sd"> QuantileDiscretizer...</span>
<span class="sd"> &gt;&gt;&gt; qds1.getRelativeError()</span>
<span class="sd"> 0.01</span>
<span class="sd"> &gt;&gt;&gt; bucketizer = qds1.fit(df1)</span>
<span class="sd"> &gt;&gt;&gt; qds1.setHandleInvalid(&quot;keep&quot;).fit(df1).transform(df1).count()</span>
<span class="sd"> 6</span>
<span class="sd"> &gt;&gt;&gt; qds1.setHandleInvalid(&quot;skip&quot;).fit(df1).transform(df1).count()</span>
<span class="sd"> 4</span>
<span class="sd"> &gt;&gt;&gt; splits = bucketizer.getSplits()</span>
<span class="sd"> &gt;&gt;&gt; splits[0]</span>
<span class="sd"> -inf</span>
<span class="sd"> &gt;&gt;&gt; print(&quot;%2.1f&quot; % round(splits[1], 1))</span>
<span class="sd"> 0.4</span>
<span class="sd"> &gt;&gt;&gt; bucketed = bucketizer.transform(df1).head()</span>
<span class="sd"> &gt;&gt;&gt; bucketed.buckets</span>
<span class="sd"> 0.0</span>
<span class="sd"> &gt;&gt;&gt; quantileDiscretizerPath = temp_path + &quot;/quantile-discretizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; qds1.save(quantileDiscretizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedQds = QuantileDiscretizer.load(quantileDiscretizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedQds.getNumBuckets() == qds1.getNumBuckets()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; inputs = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, 1.5),</span>
<span class="sd"> ... (float(&quot;nan&quot;), float(&quot;nan&quot;)), (float(&quot;nan&quot;), float(&quot;nan&quot;))]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(inputs, [&quot;input1&quot;, &quot;input2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds2 = QuantileDiscretizer(relativeError=0.01, handleInvalid=&quot;error&quot;, numBuckets=2,</span>
<span class="sd"> ... inputCols=[&quot;input1&quot;, &quot;input2&quot;], outputCols=[&quot;output1&quot;, &quot;output2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds2.getRelativeError()</span>
<span class="sd"> 0.01</span>
<span class="sd"> &gt;&gt;&gt; qds2.setHandleInvalid(&quot;keep&quot;).fit(df2).transform(df2).show()</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> |input1|input2|output1|output2|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> | 0.1| 0.0| 0.0| 0.0|</span>
<span class="sd"> | 0.4| 1.0| 1.0| 1.0|</span>
<span class="sd"> | 1.2| 1.3| 1.0| 1.0|</span>
<span class="sd"> | 1.5| 1.5| 1.0| 1.0|</span>
<span class="sd"> | NaN| NaN| 2.0| 2.0|</span>
<span class="sd"> | NaN| NaN| 2.0| 2.0|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; qds3 = QuantileDiscretizer(relativeError=0.01, handleInvalid=&quot;error&quot;,</span>
<span class="sd"> ... numBucketsArray=[5, 10], inputCols=[&quot;input1&quot;, &quot;input2&quot;],</span>
<span class="sd"> ... outputCols=[&quot;output1&quot;, &quot;output2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; qds3.setHandleInvalid(&quot;skip&quot;).fit(df2).transform(df2).show()</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> |input1|input2|output1|output2|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> | 0.1| 0.0| 1.0| 1.0|</span>
<span class="sd"> | 0.4| 1.0| 2.0| 2.0|</span>
<span class="sd"> | 1.2| 1.3| 3.0| 3.0|</span>
<span class="sd"> | 1.5| 1.5| 4.0| 4.0|</span>
<span class="sd"> +------+------+-------+-------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">numBuckets</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;numBuckets&quot;</span><span class="p">,</span>
<span class="s2">&quot;Maximum number of buckets (quantiles, or &quot;</span> <span class="o">+</span>
<span class="s2">&quot;categories) into which data points are grouped. Must be &gt;= 2.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span> <span class="s2">&quot;how to handle invalid entries. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Options are skip (filter out rows with invalid values), &quot;</span> <span class="o">+</span>
<span class="s2">&quot;error (throw an error), or keep (keep invalid values in a special &quot;</span> <span class="o">+</span>
<span class="s2">&quot;additional bucket). Note that in the multiple columns &quot;</span> <span class="o">+</span>
<span class="s2">&quot;case, the invalid handling is applied to all columns. That said &quot;</span> <span class="o">+</span>
<span class="s2">&quot;for &#39;error&#39; it will throw an error if any invalids are found in &quot;</span> <span class="o">+</span>
<span class="s2">&quot;any columns, for &#39;skip&#39; it will skip rows with any invalids in &quot;</span> <span class="o">+</span>
<span class="s2">&quot;any columns, etc.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">numBucketsArray</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;numBucketsArray&quot;</span><span class="p">,</span> <span class="s2">&quot;Array of number of buckets &quot;</span> <span class="o">+</span>
<span class="s2">&quot;(quantiles, or categories) into which data points are grouped. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;This is for multiple columns input. If transforming multiple &quot;</span> <span class="o">+</span>
<span class="s2">&quot;columns and numBucketsArray is not set, but numBuckets is set, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;then numBuckets will be applied across all columns.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListInt</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numBuckets</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">numBucketsArray</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, numBucketsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">QuantileDiscretizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.QuantileDiscretizer&quot;</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="QuantileDiscretizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numBuckets</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">numBucketsArray</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, numBucketsArray=None, inputCols=None, outputCols=None)</span>
<span class="sd"> Set the params for the QuantileDiscretizer</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setNumBuckets"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setNumBuckets">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumBuckets</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numBuckets`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.getNumBuckets"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.getNumBuckets">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumBuckets</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numBuckets or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numBuckets</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setNumBucketsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setNumBucketsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumBucketsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numBucketsArray`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numBucketsArray</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.getNumBucketsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.getNumBucketsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumBucketsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numBucketsArray or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numBucketsArray</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`relativeError`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="QuantileDiscretizer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Private method to convert the java_model to a Python model.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">isSet</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">inputCol</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">Bucketizer</span><span class="p">(</span><span class="n">splits</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">java_model</span><span class="o">.</span><span class="n">getSplits</span><span class="p">()),</span>
<span class="n">inputCol</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getInputCol</span><span class="p">(),</span>
<span class="n">outputCol</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getOutputCol</span><span class="p">(),</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">splitsArrayList</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">java_model</span><span class="o">.</span><span class="n">getSplitsArray</span><span class="p">())]</span>
<span class="k">return</span> <span class="n">Bucketizer</span><span class="p">(</span><span class="n">splitsArray</span><span class="o">=</span><span class="n">splitsArrayList</span><span class="p">,</span>
<span class="n">inputCols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getInputCols</span><span class="p">(),</span>
<span class="n">outputCols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getOutputCols</span><span class="p">(),</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">())</span></div>
<span class="k">class</span> <span class="nc">_RobustScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`RobustScaler` and :py:class:`RobustScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">lower</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;lower&quot;</span><span class="p">,</span> <span class="s2">&quot;Lower quantile to calculate quantile range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">upper</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;upper&quot;</span><span class="p">,</span> <span class="s2">&quot;Upper quantile to calculate quantile range&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">withCentering</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;withCentering&quot;</span><span class="p">,</span> <span class="s2">&quot;Whether to center data with median&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="n">withScaling</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;withScaling&quot;</span><span class="p">,</span> <span class="s2">&quot;Whether to scale the data to &quot;</span>
<span class="s2">&quot;quantile range&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">lower</span><span class="o">=</span><span class="mf">0.25</span><span class="p">,</span> <span class="n">upper</span><span class="o">=</span><span class="mf">0.75</span><span class="p">,</span> <span class="n">withCentering</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withScaling</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLower</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of lower or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">lower</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getUpper</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of upper or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">upper</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithCentering</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withCentering or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withCentering</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithScaling</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withScaling or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withScaling</span><span class="p">)</span>
<div class="viewcode-block" id="RobustScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">RobustScaler</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> RobustScaler removes the median and scales the data according to the quantile range.</span>
<span class="sd"> The quantile range is by default IQR (Interquartile Range, quantile range between the</span>
<span class="sd"> 1st quartile = 25th quantile and the 3rd quartile = 75th quantile) but can be configured.</span>
<span class="sd"> Centering and scaling happen independently on each feature by computing the relevant</span>
<span class="sd"> statistics on the samples in the training set. Median and quantile range are then</span>
<span class="sd"> stored to be used on later data using the transform method.</span>
<span class="sd"> Note that NaN values are ignored in the computation of medians and ranges.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; data = [(0, Vectors.dense([0.0, 0.0]),),</span>
<span class="sd"> ... (1, Vectors.dense([1.0, -1.0]),),</span>
<span class="sd"> ... (2, Vectors.dense([2.0, -2.0]),),</span>
<span class="sd"> ... (3, Vectors.dense([3.0, -3.0]),),</span>
<span class="sd"> ... (4, Vectors.dense([4.0, -4.0]),),]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;id&quot;, &quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; scaler = RobustScaler()</span>
<span class="sd"> &gt;&gt;&gt; scaler.setInputCol(&quot;features&quot;)</span>
<span class="sd"> RobustScaler...</span>
<span class="sd"> &gt;&gt;&gt; scaler.setOutputCol(&quot;scaled&quot;)</span>
<span class="sd"> RobustScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = scaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> RobustScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.median</span>
<span class="sd"> DenseVector([2.0, -2.0])</span>
<span class="sd"> &gt;&gt;&gt; model.range</span>
<span class="sd"> DenseVector([2.0, 2.0])</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).collect()[1].output</span>
<span class="sd"> DenseVector([0.5, -0.5])</span>
<span class="sd"> &gt;&gt;&gt; scalerPath = temp_path + &quot;/robust-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; scaler.save(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedScaler = RobustScaler.load(scalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedScaler.getWithCentering() == scaler.getWithCentering()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedScaler.getWithScaling() == scaler.getWithScaling()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/robust-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = RobustScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.median == model.median</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.range == model.range</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">lower</span><span class="o">=</span><span class="mf">0.25</span><span class="p">,</span> <span class="n">upper</span><span class="o">=</span><span class="mf">0.75</span><span class="p">,</span> <span class="n">withCentering</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withScaling</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, lower=0.25, upper=0.75, withCentering=False, withScaling=True, \</span>
<span class="sd"> inputCol=None, outputCol=None, relativeError=0.001)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RobustScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.RobustScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="RobustScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">lower</span><span class="o">=</span><span class="mf">0.25</span><span class="p">,</span> <span class="n">upper</span><span class="o">=</span><span class="mf">0.75</span><span class="p">,</span> <span class="n">withCentering</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withScaling</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, lower=0.25, upper=0.75, withCentering=False, withScaling=True, \</span>
<span class="sd"> inputCol=None, outputCol=None, relativeError=0.001)</span>
<span class="sd"> Sets params for this RobustScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setLower"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setLower">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLower</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`lower`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">lower</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setUpper"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setUpper">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setUpper</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`upper`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">upper</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setWithCentering"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setWithCentering">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithCentering</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withCentering`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withCentering</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setWithScaling"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setWithScaling">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithScaling</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withScaling`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withScaling</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScaler.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`relativeError`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">RobustScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel">[docs]</a><span class="k">class</span> <span class="nc">RobustScalerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`RobustScaler`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="RobustScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RobustScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Median of the RobustScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;median&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">range</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Quantile range of the RobustScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;range&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">RegexTokenizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A regex based tokenizer that extracts tokens either by using the</span>
<span class="sd"> provided regex pattern (in Java dialect) to split the text</span>
<span class="sd"> (default) or repeatedly matching the regex (if gaps is false).</span>
<span class="sd"> Optional parameters also allow filtering tokens using a minimal</span>
<span class="sd"> length.</span>
<span class="sd"> It returns an array of strings that can be empty.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;A B c&quot;,)], [&quot;text&quot;])</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer = RegexTokenizer()</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setInputCol(&quot;text&quot;)</span>
<span class="sd"> RegexTokenizer...</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setOutputCol(&quot;words&quot;)</span>
<span class="sd"> RegexTokenizer...</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Change a parameter.</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setParams(outputCol=&quot;tokens&quot;).transform(df).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Temporarily modify a parameter.</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.transform(df, {reTokenizer.outputCol: &quot;words&quot;}).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;A B c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Must use keyword arguments to specify params.</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.setParams(&quot;text&quot;)</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> &gt;&gt;&gt; regexTokenizerPath = temp_path + &quot;/regex-tokenizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; reTokenizer.save(regexTokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer = RegexTokenizer.load(regexTokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer.getMinTokenLength() == reTokenizer.getMinTokenLength()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer.getGaps() == reTokenizer.getGaps()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedReTokenizer.transform(df).take(1) == reTokenizer.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">minTokenLength</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;minTokenLength&quot;</span><span class="p">,</span> <span class="s2">&quot;minimum token length (&gt;= 0)&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">gaps</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;gaps&quot;</span><span class="p">,</span> <span class="s2">&quot;whether regex splits on gaps (True) or matches tokens &quot;</span> <span class="o">+</span>
<span class="s2">&quot;(False)&quot;</span><span class="p">)</span>
<span class="n">pattern</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;pattern&quot;</span><span class="p">,</span> <span class="s2">&quot;regex pattern (Java dialect) used for tokenizing&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">toLowercase</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;toLowercase&quot;</span><span class="p">,</span> <span class="s2">&quot;whether to convert all characters to &quot;</span> <span class="o">+</span>
<span class="s2">&quot;lowercase before tokenizing&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s2">&quot;</span><span class="se">\\</span><span class="s2">s+&quot;</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">toLowercase</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, minTokenLength=1, gaps=True, pattern=&quot;\\s+&quot;, inputCol=None, \</span>
<span class="sd"> outputCol=None, toLowercase=True)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RegexTokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.RegexTokenizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s2">&quot;</span><span class="se">\\</span><span class="s2">s+&quot;</span><span class="p">,</span> <span class="n">toLowercase</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="RegexTokenizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s2">&quot;</span><span class="se">\\</span><span class="s2">s+&quot;</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">toLowercase</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, minTokenLength=1, gaps=True, pattern=&quot;\\s+&quot;, inputCol=None, \</span>
<span class="sd"> outputCol=None, toLowercase=True)</span>
<span class="sd"> Sets params for this RegexTokenizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setMinTokenLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setMinTokenLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minTokenLength`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTokenLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getMinTokenLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getMinTokenLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minTokenLength or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minTokenLength</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setGaps"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setGaps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`gaps`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">gaps</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getGaps"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getGaps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of gaps or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">gaps</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setPattern"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setPattern">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`pattern`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">pattern</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getPattern"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getPattern">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of pattern or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pattern</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setToLowercase"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setToLowercase">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setToLowercase</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`toLowercase`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">toLowercase</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.getToLowercase"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getToLowercase">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getToLowercase</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of toLowercase or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">toLowercase</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RegexTokenizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="SQLTransformer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">SQLTransformer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Implements the transforms which are defined by SQL statement.</span>
<span class="sd"> Currently we only support SQL syntax like `SELECT ... FROM __THIS__`</span>
<span class="sd"> where `__THIS__` represents the underlying table of the input dataset.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], [&quot;id&quot;, &quot;v1&quot;, &quot;v2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; sqlTrans = SQLTransformer(</span>
<span class="sd"> ... statement=&quot;SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__&quot;)</span>
<span class="sd"> &gt;&gt;&gt; sqlTrans.transform(df).head()</span>
<span class="sd"> Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)</span>
<span class="sd"> &gt;&gt;&gt; sqlTransformerPath = temp_path + &quot;/sql-transformer&quot;</span>
<span class="sd"> &gt;&gt;&gt; sqlTrans.save(sqlTransformerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSqlTrans = SQLTransformer.load(sqlTransformerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSqlTrans.getStatement() == sqlTrans.getStatement()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedSqlTrans.transform(df).take(1) == sqlTrans.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">statement</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;statement&quot;</span><span class="p">,</span> <span class="s2">&quot;SQL statement&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">statement</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, statement=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">SQLTransformer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.SQLTransformer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="SQLTransformer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">statement</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, statement=None)</span>
<span class="sd"> Sets params for this SQLTransformer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="SQLTransformer.setStatement"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.setStatement">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStatement</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`statement`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">statement</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="SQLTransformer.getStatement"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.getStatement">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStatement</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of statement or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">statement</span><span class="p">)</span></div></div>
<span class="k">class</span> <span class="nc">_StandardScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`StandardScaler` and :py:class:`StandardScalerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">withMean</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;withMean&quot;</span><span class="p">,</span> <span class="s2">&quot;Center data with mean&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="n">withStd</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;withStd&quot;</span><span class="p">,</span> <span class="s2">&quot;Scale to unit standard deviation&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_StandardScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withMean or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withMean</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of withStd or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withStd</span><span class="p">)</span>
<div class="viewcode-block" id="StandardScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">StandardScaler</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_StandardScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Standardizes features by removing the mean and scaling to unit variance using column summary</span>
<span class="sd"> statistics on the samples in the training set.</span>
<span class="sd"> The &quot;unit std&quot; is computed using the `corrected sample standard deviation \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation&gt;`_,</span>
<span class="sd"> which is computed as the square root of the unbiased sample variance.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; standardScaler = StandardScaler()</span>
<span class="sd"> &gt;&gt;&gt; standardScaler.setInputCol(&quot;a&quot;)</span>
<span class="sd"> StandardScaler...</span>
<span class="sd"> &gt;&gt;&gt; standardScaler.setOutputCol(&quot;scaled&quot;)</span>
<span class="sd"> StandardScaler...</span>
<span class="sd"> &gt;&gt;&gt; model = standardScaler.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getInputCol()</span>
<span class="sd"> &#39;a&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> StandardScalerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.mean</span>
<span class="sd"> DenseVector([1.0])</span>
<span class="sd"> &gt;&gt;&gt; model.std</span>
<span class="sd"> DenseVector([1.4142])</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).collect()[1].output</span>
<span class="sd"> DenseVector([1.4142])</span>
<span class="sd"> &gt;&gt;&gt; standardScalerPath = temp_path + &quot;/standard-scaler&quot;</span>
<span class="sd"> &gt;&gt;&gt; standardScaler.save(standardScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedStandardScaler = StandardScaler.load(standardScalerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedStandardScaler.getWithMean() == standardScaler.getWithMean()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedStandardScaler.getWithStd() == standardScaler.getWithStd()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/standard-scaler-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = StandardScalerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.std == model.std</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.mean == model.mean</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">withMean</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, withMean=False, withStd=True, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StandardScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.StandardScaler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="StandardScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">withMean</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, withMean=False, withStd=True, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this StandardScaler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setWithMean"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setWithMean">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withMean`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setWithStd"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setWithStd">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`withStd`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withStd</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StandardScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel">[docs]</a><span class="k">class</span> <span class="nc">StandardScalerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_StandardScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`StandardScaler`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="StandardScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StandardScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Standard deviation of the StandardScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;std&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mean of the StandardScalerModel.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;mean&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_StringIndexerParams</span><span class="p">(</span><span class="n">JavaParams</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span>
<span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`StringIndexer` and :py:class:`StringIndexerModel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stringOrderType</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;stringOrderType&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to order labels of string column. The first label after &quot;</span> <span class="o">+</span>
<span class="s2">&quot;ordering is assigned an index of 0. Supported options: &quot;</span> <span class="o">+</span>
<span class="s2">&quot;frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Default is frequencyDesc. In case of equal frequency when &quot;</span> <span class="o">+</span>
<span class="s2">&quot;under frequencyDesc/Asc, the strings are further sorted &quot;</span> <span class="o">+</span>
<span class="s2">&quot;alphabetically&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span> <span class="s2">&quot;how to handle invalid data (unseen &quot;</span> <span class="o">+</span>
<span class="s2">&quot;or NULL values) in features and label column of string type. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Options are &#39;skip&#39; (filter out rows with invalid data), &quot;</span> <span class="o">+</span>
<span class="s2">&quot;error (throw an error), or &#39;keep&#39; (put invalid data &quot;</span> <span class="o">+</span>
<span class="s2">&quot;in a special additional bucket, at index numLabels).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_StringIndexerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">stringOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStringOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`stringOrderType` or its default value &#39;frequencyDesc&#39;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stringOrderType</span><span class="p">)</span>
<div class="viewcode-block" id="StringIndexer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">StringIndexer</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_StringIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A label indexer that maps a string column of labels to an ML column of label indices.</span>
<span class="sd"> If the input column is numeric, we cast it to string and index the string values.</span>
<span class="sd"> The indices are in [0, numLabels). By default, this is ordered by label frequencies</span>
<span class="sd"> so the most frequent label gets index 0. The ordering behavior is controlled by</span>
<span class="sd"> setting :py:attr:`stringOrderType`. Its default value is &#39;frequencyDesc&#39;.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer = StringIndexer(inputCol=&quot;label&quot;, outputCol=&quot;indexed&quot;,</span>
<span class="sd"> ... stringOrderType=&quot;frequencyDesc&quot;)</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer.setHandleInvalid(&quot;error&quot;)</span>
<span class="sd"> StringIndexer...</span>
<span class="sd"> &gt;&gt;&gt; model = stringIndexer.fit(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; model.setHandleInvalid(&quot;error&quot;)</span>
<span class="sd"> StringIndexerModel...</span>
<span class="sd"> &gt;&gt;&gt; td = model.transform(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]</span>
<span class="sd"> &gt;&gt;&gt; inverter = IndexToString(inputCol=&quot;indexed&quot;, outputCol=&quot;label2&quot;, labels=model.labels)</span>
<span class="sd"> &gt;&gt;&gt; itd = inverter.transform(td)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, &#39;a&#39;), (1, &#39;b&#39;), (2, &#39;c&#39;), (3, &#39;a&#39;), (4, &#39;a&#39;), (5, &#39;c&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; stringIndexerPath = temp_path + &quot;/string-indexer&quot;</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer.save(stringIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer = StringIndexer.load(stringIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer.getHandleInvalid() == stringIndexer.getHandleInvalid()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/string-indexer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = StringIndexerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.labels == model.labels</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; indexToStringPath = temp_path + &quot;/index-to-string&quot;</span>
<span class="sd"> &gt;&gt;&gt; inverter.save(indexToStringPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInverter = IndexToString.load(indexToStringPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedInverter.getLabels() == inverter.getLabels()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(stringIndDf).take(1) == model.transform(stringIndDf).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer.getStringOrderType()</span>
<span class="sd"> &#39;frequencyDesc&#39;</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer = StringIndexer(inputCol=&quot;label&quot;, outputCol=&quot;indexed&quot;, handleInvalid=&quot;error&quot;,</span>
<span class="sd"> ... stringOrderType=&quot;alphabetDesc&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = stringIndexer.fit(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; td = model.transform(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)]</span>
<span class="sd"> &gt;&gt;&gt; fromlabelsModel = StringIndexerModel.from_labels([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],</span>
<span class="sd"> ... inputCol=&quot;label&quot;, outputCol=&quot;indexed&quot;, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result = fromlabelsModel.transform(stringIndDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1]) for i in result.select(result.id, result.indexed).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 2.0)]</span>
<span class="sd"> &gt;&gt;&gt; testData = sc.parallelize([Row(id=0, label1=&quot;a&quot;, label2=&quot;e&quot;),</span>
<span class="sd"> ... Row(id=1, label1=&quot;b&quot;, label2=&quot;f&quot;),</span>
<span class="sd"> ... Row(id=2, label1=&quot;c&quot;, label2=&quot;e&quot;),</span>
<span class="sd"> ... Row(id=3, label1=&quot;a&quot;, label2=&quot;f&quot;),</span>
<span class="sd"> ... Row(id=4, label1=&quot;a&quot;, label2=&quot;f&quot;),</span>
<span class="sd"> ... Row(id=5, label1=&quot;c&quot;, label2=&quot;f&quot;)], 3)</span>
<span class="sd"> &gt;&gt;&gt; multiRowDf = spark.createDataFrame(testData)</span>
<span class="sd"> &gt;&gt;&gt; inputs = [&quot;label1&quot;, &quot;label2&quot;]</span>
<span class="sd"> &gt;&gt;&gt; outputs = [&quot;index1&quot;, &quot;index2&quot;]</span>
<span class="sd"> &gt;&gt;&gt; stringIndexer = StringIndexer(inputCols=inputs, outputCols=outputs)</span>
<span class="sd"> &gt;&gt;&gt; model = stringIndexer.fit(multiRowDf)</span>
<span class="sd"> &gt;&gt;&gt; result = model.transform(multiRowDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,</span>
<span class="sd"> ... result.index2).collect()]), key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0, 1.0), (1, 2.0, 0.0), (2, 1.0, 1.0), (3, 0.0, 0.0), (4, 0.0, 0.0), (5, 1.0, 0.0)]</span>
<span class="sd"> &gt;&gt;&gt; fromlabelsModel = StringIndexerModel.from_arrays_of_labels([[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;], [&quot;e&quot;, &quot;f&quot;]],</span>
<span class="sd"> ... inputCols=inputs, outputCols=outputs)</span>
<span class="sd"> &gt;&gt;&gt; result = fromlabelsModel.transform(multiRowDf)</span>
<span class="sd"> &gt;&gt;&gt; sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,</span>
<span class="sd"> ... result.index2).collect()]), key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0, 0.0), (1, 1.0, 1.0), (2, 2.0, 0.0), (3, 0.0, 1.0), (4, 0.0, 1.0), (5, 2.0, 1.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">stringOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, stringOrderType=&quot;frequencyDesc&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StringIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.StringIndexer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="StringIndexer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span> <span class="n">stringOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;, stringOrderType=&quot;frequencyDesc&quot;)</span>
<span class="sd"> Sets params for this StringIndexer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StringIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<div class="viewcode-block" id="StringIndexer.setStringOrderType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setStringOrderType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStringOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stringOrderType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stringOrderType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="StringIndexerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel">[docs]</a><span class="k">class</span> <span class="nc">StringIndexerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_StringIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`StringIndexer`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="StringIndexerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StringIndexerModel.from_labels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.from_labels">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">from_labels</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">labels</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct the model directly from an array of label strings,</span>
<span class="sd"> requires an active SparkContext.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span>
<span class="n">jlabels</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">labels</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.StringIndexerModel&quot;</span><span class="p">,</span> <span class="n">jlabels</span><span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">setInputCol</span><span class="p">(</span><span class="n">inputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">outputCol</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setOutputCol</span><span class="p">(</span><span class="n">outputCol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">handleInvalid</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="n">handleInvalid</span><span class="p">)</span>
<span class="k">return</span> <span class="n">model</span></div>
<div class="viewcode-block" id="StringIndexerModel.from_arrays_of_labels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.from_arrays_of_labels">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">from_arrays_of_labels</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">arrayOfLabels</span><span class="p">,</span> <span class="n">inputCols</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct the model directly from an array of array of label strings,</span>
<span class="sd"> requires an active SparkContext.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span>
<span class="n">jlabels</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">arrayOfLabels</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.StringIndexerModel&quot;</span><span class="p">,</span> <span class="n">jlabels</span><span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">setInputCols</span><span class="p">(</span><span class="n">inputCols</span><span class="p">)</span>
<span class="k">if</span> <span class="n">outputCols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setOutputCols</span><span class="p">(</span><span class="n">outputCols</span><span class="p">)</span>
<span class="k">if</span> <span class="n">handleInvalid</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">model</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="n">handleInvalid</span><span class="p">)</span>
<span class="k">return</span> <span class="n">model</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">labels</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Ordered list of labels, corresponding to indices to be assigned.</span>
<span class="sd"> .. deprecated:: 3.1.0</span>
<span class="sd"> It will be removed in future versions. Use `labelsArray` method instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;labels&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.2&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">labelsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Array of ordered list of labels, corresponding to indices to be assigned</span>
<span class="sd"> for each input column.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;labelsArray&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">IndexToString</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A :py:class:`pyspark.ml.base.Transformer` that maps a column of indices back to a new column of</span>
<span class="sd"> corresponding string values.</span>
<span class="sd"> The index-string mapping is either from the ML attributes of the input column,</span>
<span class="sd"> or from user-supplied labels (which take precedence over ML attributes).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> StringIndexer : for converting categorical values into category indices</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">labels</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;labels&quot;</span><span class="p">,</span>
<span class="s2">&quot;Optional array of labels specifying index-string mapping.&quot;</span> <span class="o">+</span>
<span class="s2">&quot; If not provided or if empty, then metadata from inputCol is used instead.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">labels</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, labels=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">IndexToString</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.IndexToString&quot;</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="IndexToString.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">labels</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, labels=None)</span>
<span class="sd"> Sets params for this IndexToString.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.setLabels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setLabels">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLabels</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labels`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labels</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.getLabels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.getLabels">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLabels</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`labels` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">labels</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="IndexToString.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- NOTE(review): generated Sphinx "viewcode" rendering of pyspark.ml.feature.StopWordsRemover.
     The <span> markup is Pygments highlighting and mirrors the Python source; do not hand-edit
     tokens — regenerate from the .py file. Only docstring typos are corrected below
     ("caseSensitive=false" -> "False", "StopWordRemover" -> "StopWordsRemover"). -->
<div class="viewcode-block" id="StopWordsRemover"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover">[docs]</a><span class="k">class</span> <span class="nc">StopWordsRemover</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that filters out stop words from input.</span>
<span class="sd"> Since 3.0.0, :py:class:`StopWordsRemover` can filter out multiple columns at once by setting</span>
<span class="sd"> the :py:attr:`inputCols` parameter. Note that when both the :py:attr:`inputCol` and</span>
<span class="sd"> :py:attr:`inputCols` parameters are set, an Exception will be thrown.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> null values from input array are preserved unless adding null to stopWords explicitly.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],)], [&quot;text&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover = StopWordsRemover(stopWords=[&quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover.setInputCol(&quot;text&quot;)</span>
<span class="sd"> StopWordsRemover...</span>
<span class="sd"> &gt;&gt;&gt; remover.setOutputCol(&quot;words&quot;)</span>
<span class="sd"> StopWordsRemover...</span>
<span class="sd"> &gt;&gt;&gt; remover.transform(df).head().words == [&#39;a&#39;, &#39;c&#39;]</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; stopWordsRemoverPath = temp_path + &quot;/stopwords-remover&quot;</span>
<span class="sd"> &gt;&gt;&gt; remover.save(stopWordsRemoverPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover = StopWordsRemover.load(stopWordsRemoverPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover.getStopWords() == remover.getStopWords()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover.getCaseSensitive() == remover.getCaseSensitive()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRemover.transform(df).take(1) == remover.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;], [&quot;a&quot;, &quot;b&quot;])], [&quot;text1&quot;, &quot;text2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover2 = StopWordsRemover(stopWords=[&quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; remover2.setInputCols([&quot;text1&quot;, &quot;text2&quot;]).setOutputCols([&quot;words1&quot;, &quot;words2&quot;])</span>
<span class="sd"> StopWordsRemover...</span>
<span class="sd"> &gt;&gt;&gt; remover2.transform(df2).show()</span>
<span class="sd"> +---------+------+------+------+</span>
<span class="sd"> | text1| text2|words1|words2|</span>
<span class="sd"> +---------+------+------+------+</span>
<span class="sd"> |[a, b, c]|[a, b]|[a, c]| [a]|</span>
<span class="sd"> +---------+------+------+------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stopWords</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;stopWords&quot;</span><span class="p">,</span> <span class="s2">&quot;The words to be filtered out&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">)</span>
<span class="n">caseSensitive</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;caseSensitive&quot;</span><span class="p">,</span> <span class="s2">&quot;whether to do a case sensitive &quot;</span> <span class="o">+</span>
<span class="s2">&quot;comparison over the stop words&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="n">locale</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;locale&quot;</span><span class="p">,</span> <span class="s2">&quot;locale of the input. ignored when case sensitive &quot;</span> <span class="o">+</span>
<span class="s2">&quot;is true&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">stopWords</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">caseSensitive</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">locale</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, \</span>
<span class="sd"> locale=None, inputCols=None, outputCols=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StopWordsRemover</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.StopWordsRemover&quot;</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">stopWords</span><span class="o">=</span><span class="n">StopWordsRemover</span><span class="o">.</span><span class="n">loadDefaultStopWords</span><span class="p">(</span><span class="s2">&quot;english&quot;</span><span class="p">),</span>
<span class="n">caseSensitive</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">locale</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">getLocale</span><span class="p">())</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="StopWordsRemover.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">stopWords</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">caseSensitive</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">locale</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, \</span>
<span class="sd"> locale=None, inputCols=None, outputCols=None)</span>
<span class="sd"> Sets params for this StopWordsRemover.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setStopWords">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStopWords</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stopWords`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stopWords</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.getStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getStopWords">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStopWords</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`stopWords` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stopWords</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setCaseSensitive"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setCaseSensitive">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setCaseSensitive</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`caseSensitive`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">caseSensitive</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.getCaseSensitive"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getCaseSensitive">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getCaseSensitive</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`caseSensitive` or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">caseSensitive</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setLocale"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setLocale">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLocale</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`locale`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">locale</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.getLocale"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getLocale">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLocale</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`locale`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">locale</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="StopWordsRemover.loadDefaultStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.loadDefaultStopWords">[docs]</a> <span class="nd">@staticmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">loadDefaultStopWords</span><span class="p">(</span><span class="n">language</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Loads the default stop words for the given language.</span>
<span class="sd"> Supported languages: danish, dutch, english, finnish, french, german, hungarian,</span>
<span class="sd"> italian, norwegian, portuguese, russian, spanish, swedish, turkish</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stopWordsObj</span> <span class="o">=</span> <span class="n">_jvm</span><span class="p">()</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">feature</span><span class="o">.</span><span class="n">StopWordsRemover</span>
<span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="n">stopWordsObj</span><span class="o">.</span><span class="n">loadDefaultStopWords</span><span class="p">(</span><span class="n">language</span><span class="p">))</span></div></div>
<!-- NOTE(review): generated Sphinx "viewcode" rendering of pyspark.ml.feature.Tokenizer;
     Pygments highlight spans mirror the Python source — regenerate from the .py file
     rather than hand-editing tokens. Markup left byte-identical. -->
<div class="viewcode-block" id="Tokenizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Tokenizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A tokenizer that converts the input string to lowercase and then</span>
<span class="sd"> splits it by white spaces.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a b c&quot;,)], [&quot;text&quot;])</span>
<span class="sd"> &gt;&gt;&gt; tokenizer = Tokenizer(outputCol=&quot;words&quot;)</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.setInputCol(&quot;text&quot;)</span>
<span class="sd"> Tokenizer...</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Change a parameter.</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.setParams(outputCol=&quot;tokens&quot;).transform(df).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Temporarily modify a parameter.</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.transform(df, {tokenizer.outputCol: &quot;words&quot;}).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, words=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.transform(df).head()</span>
<span class="sd"> Row(text=&#39;a b c&#39;, tokens=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; # Must use keyword arguments to specify params.</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.setParams(&quot;text&quot;)</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> &gt;&gt;&gt; tokenizerPath = temp_path + &quot;/tokenizer&quot;</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.save(tokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedTokenizer = Tokenizer.load(tokenizerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedTokenizer.transform(df).head().tokens == tokenizer.transform(df).head().tokens</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Tokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Tokenizer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Tokenizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Tokenizer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Tokenizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Tokenizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<div class="viewcode-block" id="VectorAssembler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorAssembler</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that merges multiple columns into a vector column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 0, 3)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler = VectorAssembler(outputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.setInputCols([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> VectorAssembler...</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.transform(df).head().features</span>
<span class="sd"> DenseVector([1.0, 0.0, 3.0])</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.setParams(outputCol=&quot;freqs&quot;).transform(df).head().freqs</span>
<span class="sd"> DenseVector([1.0, 0.0, 3.0])</span>
<span class="sd"> &gt;&gt;&gt; params = {vecAssembler.inputCols: [&quot;b&quot;, &quot;a&quot;], vecAssembler.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.transform(df, params).head().vector</span>
<span class="sd"> DenseVector([0.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorAssemblerPath = temp_path + &quot;/vector-assembler&quot;</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler.save(vectorAssemblerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedAssembler = VectorAssembler.load(vectorAssemblerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedAssembler.transform(df).head().freqs == vecAssembler.transform(df).head().freqs</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; dfWithNullsAndNaNs = spark.createDataFrame(</span>
<span class="sd"> ... [(1.0, 2.0, None), (3.0, float(&quot;nan&quot;), 4.0), (5.0, 6.0, 7.0)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler2 = VectorAssembler(inputCols=[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;], outputCol=&quot;features&quot;,</span>
<span class="sd"> ... handleInvalid=&quot;keep&quot;)</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler2.transform(dfWithNullsAndNaNs).show()</span>
<span class="sd"> +---+---+----+-------------+</span>
<span class="sd"> | a| b| c| features|</span>
<span class="sd"> +---+---+----+-------------+</span>
<span class="sd"> |1.0|2.0|null|[1.0,2.0,NaN]|</span>
<span class="sd"> |3.0|NaN| 4.0|[3.0,NaN,4.0]|</span>
<span class="sd"> |5.0|6.0| 7.0|[5.0,6.0,7.0]|</span>
<span class="sd"> +---+---+----+-------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler2.setParams(handleInvalid=&quot;skip&quot;).transform(dfWithNullsAndNaNs).show()</span>
<span class="sd"> +---+---+---+-------------+</span>
<span class="sd"> | a| b| c| features|</span>
<span class="sd"> +---+---+---+-------------+</span>
<span class="sd"> |5.0|6.0|7.0|[5.0,6.0,7.0]|</span>
<span class="sd"> +---+---+---+-------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span> <span class="s2">&quot;How to handle invalid data (NULL &quot;</span> <span class="o">+</span>
<span class="s2">&quot;and NaN values). Options are &#39;skip&#39; (filter out rows with invalid &quot;</span> <span class="o">+</span>
<span class="s2">&quot;data), &#39;error&#39; (throw an error), or &#39;keep&#39; (return relevant number &quot;</span> <span class="o">+</span>
<span class="s2">&quot;of NaN in the output). Column lengths are taken from the size of ML &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Attribute Group, which can be set using `VectorSizeHint` in a &quot;</span> <span class="o">+</span>
<span class="s2">&quot;pipeline before `VectorAssembler`. Column lengths can also be &quot;</span> <span class="o">+</span>
<span class="s2">&quot;inferred from first rows of the data since it is safe to do so but &quot;</span> <span class="o">+</span>
<span class="s2">&quot;only in case of &#39;error&#39; or &#39;skip&#39;).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<!-- Pygments-rendered Python: VectorAssembler.__init__.
     Keyword-only constructor (inputCols=None, outputCol=None,
     handleInvalid="error"); creates the JVM peer
     org.apache.spark.ml.feature.VectorAssembler, sets the
     handleInvalid default, then forwards captured kwargs to setParams. -->
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCols=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorAssembler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorAssembler&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<!-- Pygments-rendered Python: VectorAssembler.setParams (since 1.4.0).
     Bulk-sets the same three keyword-only params via self._set(**kwargs);
     returns self for chaining. The viewcode div links back to the API page. -->
<div class="viewcode-block" id="VectorAssembler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCols=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for this VectorAssembler.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<!-- Pygments-rendered Python: VectorAssembler.setInputCols.
     Single-param setter; delegates to self._set and returns self. -->
<div class="viewcode-block" id="VectorAssembler.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setInputCols">[docs]</a> <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCols`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorAssembler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Pygments-rendered Python: VectorAssembler.setHandleInvalid.
     Single-param setter; delegates to self._set and returns self.
     The trailing double </div> also closes the enclosing class's
     viewcode block. -->
<div class="viewcode-block" id="VectorAssembler.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- Pygments-rendered Python: _VectorIndexerParams mixin (since 3.0.0).
     Shared params for VectorIndexer and VectorIndexerModel:
       - maxCategories (int, default 20 via _setDefault): threshold for how
         many distinct values a feature may have and still be categorical.
       - handleInvalid (str, default "error"): 'skip' | 'error' | 'keep';
         overrides the inherited HasHandleInvalid doc text.
     Private (no viewcode div): not linked from the API reference. -->
<span class="k">class</span> <span class="nc">_VectorIndexerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`VectorIndexer` and :py:class:`VectorIndexerModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">maxCategories</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;maxCategories&quot;</span><span class="p">,</span>
<span class="s2">&quot;Threshold for the number of values a categorical feature can take &quot;</span> <span class="o">+</span>
<span class="s2">&quot;(&gt;= 2). If a feature is found to have &gt; maxCategories values, then &quot;</span> <span class="o">+</span>
<span class="s2">&quot;it is declared continuous.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span> <span class="s2">&quot;How to handle invalid data &quot;</span> <span class="o">+</span>
<span class="s2">&quot;(unseen labels or NULL values). Options are &#39;skip&#39; (filter out &quot;</span> <span class="o">+</span>
<span class="s2">&quot;rows with invalid data), &#39;error&#39; (throw an error), or &#39;keep&#39; (put &quot;</span> <span class="o">+</span>
<span class="s2">&quot;invalid data in a special additional bucket, at index of the number &quot;</span> <span class="o">+</span>
<span class="s2">&quot;of categories of the feature).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<!-- __init__ forwards *args to the mixin chain, then installs the defaults
     maxCategories=20, handleInvalid="error". -->
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_VectorIndexerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<!-- Getter for maxCategories (falls back to the default above). -->
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of maxCategories or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxCategories</span><span class="p">)</span>
<!-- Pygments-rendered Python: VectorIndexer estimator (since 1.4.0).
     JavaEstimator over org.apache.spark.ml.feature.VectorIndexer; indexes
     categorical features in a Vector column, driven by maxCategories.
     The long docstring below includes doctest Examples executed by Spark's
     doc tests (temp_path / spark fixtures are provided by that harness).
     NOTE(review): the rendered docstring line "the maximum number of
     categorical any categorical feature should have" looks like an upstream
     typo for "number of categories"; it must be fixed in
     pyspark/ml/feature.py, not in this generated page. -->
<div class="viewcode-block" id="VectorIndexer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorIndexer</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_VectorIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Class for indexing categorical feature columns in a dataset of `Vector`.</span>
<span class="sd"> This has 2 usage modes:</span>
<span class="sd"> - Automatically identify categorical features (default behavior)</span>
<span class="sd"> - This helps process a dataset of unknown vectors into a dataset with some continuous</span>
<span class="sd"> features and some categorical features. The choice between continuous and categorical</span>
<span class="sd"> is based upon a maxCategories parameter.</span>
<span class="sd"> - Set maxCategories to the maximum number of categorical any categorical feature should</span>
<span class="sd"> have.</span>
<span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span>
<span class="sd"> If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},</span>
<span class="sd"> and feature 1 will be declared continuous.</span>
<span class="sd"> - Index all features, if all features are categorical</span>
<span class="sd"> - If maxCategories is set to be very large, then this will build an index of unique</span>
<span class="sd"> values for all features.</span>
<span class="sd"> - Warning: This can cause problems if features are continuous since this will collect ALL</span>
<span class="sd"> unique values to the driver.</span>
<span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span>
<span class="sd"> If maxCategories &gt;= 3, then both features will be declared categorical.</span>
<span class="sd"> This returns a model which can transform categorical features to use 0-based indices.</span>
<span class="sd"> Index stability:</span>
<span class="sd"> - This is not guaranteed to choose the same category index across multiple runs.</span>
<span class="sd"> - If a categorical feature includes value 0, then this is guaranteed to map value 0 to</span>
<span class="sd"> index 0. This maintains vector sparsity.</span>
<span class="sd"> - More stability may be added in the future.</span>
<span class="sd"> TODO: Future extensions: The following functionality is planned for the future:</span>
<span class="sd"> - Preserve metadata in transform; if a feature&#39;s metadata is already present,</span>
<span class="sd"> do not recompute.</span>
<span class="sd"> - Specify certain features to not index, either via a parameter or via existing metadata.</span>
<span class="sd"> - Add warning if a categorical feature has only 1 category.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; indexer = VectorIndexer(maxCategories=2, inputCol=&quot;a&quot;)</span>
<span class="sd"> &gt;&gt;&gt; indexer.setOutputCol(&quot;indexed&quot;)</span>
<span class="sd"> VectorIndexer...</span>
<span class="sd"> &gt;&gt;&gt; model = indexer.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; indexer.getHandleInvalid()</span>
<span class="sd"> &#39;error&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> VectorIndexerModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().output</span>
<span class="sd"> DenseVector([1.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.numFeatures</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.categoryMaps</span>
<span class="sd"> {0: {0.0: 0, -1.0: 1}}</span>
<span class="sd"> &gt;&gt;&gt; indexer.setParams(outputCol=&quot;test&quot;).fit(df).transform(df).collect()[1].test</span>
<span class="sd"> DenseVector([0.0, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; params = {indexer.maxCategories: 3, indexer.outputCol: &quot;vector&quot;}</span>
<span class="sd"> &gt;&gt;&gt; model2 = indexer.fit(df, params)</span>
<span class="sd"> &gt;&gt;&gt; model2.transform(df).head().vector</span>
<span class="sd"> DenseVector([1.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorIndexerPath = temp_path + &quot;/vector-indexer&quot;</span>
<span class="sd"> &gt;&gt;&gt; indexer.save(vectorIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer = VectorIndexer.load(vectorIndexerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedIndexer.getMaxCategories() == indexer.getMaxCategories()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/vector-indexer-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = VectorIndexerModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.numFeatures == model.numFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.categoryMaps == model.categoryMaps</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; dfWithInvalid = spark.createDataFrame([(Vectors.dense([3.0, 1.0]),)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; indexer.getHandleInvalid()</span>
<span class="sd"> &#39;error&#39;</span>
<span class="sd"> &gt;&gt;&gt; model3 = indexer.setHandleInvalid(&quot;skip&quot;).fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model3.transform(dfWithInvalid).count()</span>
<span class="sd"> 0</span>
<span class="sd"> &gt;&gt;&gt; model4 = indexer.setParams(handleInvalid=&quot;keep&quot;, outputCol=&quot;indexed&quot;).fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model4.transform(dfWithInvalid).head().indexed</span>
<span class="sd"> DenseVector([2.0, 1.0])</span>
<span class="sd"> &quot;&quot;&quot;</span>
<!-- Keyword-only constructor: builds the JVM peer; param defaults come from
     _VectorIndexerParams.__init__ (note: unlike VectorAssembler.__init__,
     no extra _setDefault call here). -->
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, maxCategories=20, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorIndexer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<!-- Bulk setter mirroring __init__'s keyword-only signature. -->
<div class="viewcode-block" id="VectorIndexer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, maxCategories=20, inputCol=None, outputCol=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for this VectorIndexer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<!-- Individual chaining setters; each delegates to self._set. -->
<div class="viewcode-block" id="VectorIndexer.setMaxCategories"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setMaxCategories">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxCategories`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- JavaEstimator hook: wraps the fitted JVM model in VectorIndexerModel. -->
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">VectorIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- Pygments-rendered Python: VectorIndexerModel (since 1.4.0).
     Fitted JavaModel produced by VectorIndexer.fit; exposes chaining
     setters (since 3.0.0) plus two read-only properties that forward to
     the JVM peer via _call_java: numFeatures and categoryMaps (the latter
     calls the JVM method "javaCategoryMaps"). -->
<div class="viewcode-block" id="VectorIndexerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel">[docs]</a><span class="k">class</span> <span class="nc">VectorIndexerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_VectorIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`VectorIndexer`.</span>
<span class="sd"> Transform categorical features to use 0-based indices instead of their original values.</span>
<span class="sd"> - Categorical features are mapped to indices.</span>
<span class="sd"> - Continuous features (columns) are left unchanged.</span>
<span class="sd"> This also appends metadata to the output column, marking features as Numeric (continuous),</span>
<span class="sd"> Nominal (categorical), or Binary (either continuous or categorical).</span>
<span class="sd"> Non-ML metadata is not carried over from the input to the output column.</span>
<span class="sd"> This maintains vector sparsity.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="VectorIndexerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorIndexerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<!-- Read-only: vector length this model transforms. -->
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">numFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Number of features, i.e., length of Vectors which this transforms.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;numFeatures&quot;</span><span class="p">)</span>
<!-- Read-only: {feature index -> {original value -> category index}};
     features absent from the map are continuous. -->
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">categoryMaps</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Feature value index. Keys are categorical feature indices (column indices).</span>
<span class="sd"> Values are maps from original features values to 0-based category indices.</span>
<span class="sd"> If a feature is not in this map, it is treated as continuous.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;javaCategoryMaps&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorSlicer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This class takes a feature vector and outputs a new feature vector with a subarray</span>
<span class="sd"> of the original features.</span>
<span class="sd"> The subset of features can be specified with either indices (`setIndices()`)</span>
<span class="sd"> or names (`setNames()`). At least one feature must be selected. Duplicate features</span>
<span class="sd"> are not allowed, so there can be no overlap between selected indices and names.</span>
<span class="sd"> The output vector will order features with the selected indices first (in the order given),</span>
<span class="sd"> followed by the selected names (in the order given).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], [&quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; vs = VectorSlicer(outputCol=&quot;sliced&quot;, indices=[1, 4])</span>
<span class="sd"> &gt;&gt;&gt; vs.setInputCol(&quot;features&quot;)</span>
<span class="sd"> VectorSlicer...</span>
<span class="sd"> &gt;&gt;&gt; vs.transform(df).head().sliced</span>
<span class="sd"> DenseVector([2.3, 1.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorSlicerPath = temp_path + &quot;/vector-slicer&quot;</span>
<span class="sd"> &gt;&gt;&gt; vs.save(vectorSlicerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedVs = VectorSlicer.load(vectorSlicerPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedVs.getIndices() == vs.getIndices()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedVs.getNames() == vs.getNames()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedVs.transform(df).take(1) == vs.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">indices</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;indices&quot;</span><span class="p">,</span> <span class="s2">&quot;An array of indices to select features from &quot;</span> <span class="o">+</span>
<span class="s2">&quot;a vector column. There can be no overlap with names.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListInt</span><span class="p">)</span>
<span class="n">names</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;names&quot;</span><span class="p">,</span> <span class="s2">&quot;An array of feature names to select features from &quot;</span> <span class="o">+</span>
<span class="s2">&quot;a vector column. These names must be specified by ML &quot;</span> <span class="o">+</span>
<span class="s2">&quot;org.apache.spark.ml.attribute.Attribute. There can be no overlap with &quot;</span> <span class="o">+</span>
<span class="s2">&quot;indices.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">indices</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, indices=None, names=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorSlicer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorSlicer&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">indices</span><span class="o">=</span><span class="p">[],</span> <span class="n">names</span><span class="o">=</span><span class="p">[])</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VectorSlicer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">indices</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, indices=None, names=None)</span>
<span class="sd"> Sets params for this VectorSlicer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setIndices"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setIndices">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setIndices</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`indices`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">indices</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.getIndices"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.getIndices">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getIndices</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of indices or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">indices</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setNames"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setNames">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNames</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`names`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">names</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.getNames"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.getNames">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.6.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNames</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of names or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">names</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSlicer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- Sphinx viewcode rendering of pyspark.ml.feature._Word2VecParams: the shared
     params mixin (vectorSize, numPartitions, minCount, windowSize,
     maxSentenceLength) and their getters, used by Word2Vec and Word2VecModel.
     Generated output — edit the .py source and rebuild rather than hand-editing
     these highlight spans. -->
<span class="k">class</span> <span class="nc">_Word2VecParams</span><span class="p">(</span><span class="n">HasStepSize</span><span class="p">,</span> <span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasSeed</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`Word2Vec` and :py:class:`Word2VecModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">vectorSize</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;vectorSize&quot;</span><span class="p">,</span>
<span class="s2">&quot;the dimension of codes after transforming from words&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">numPartitions</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;numPartitions&quot;</span><span class="p">,</span>
<span class="s2">&quot;number of partitions for sentences of words&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">minCount</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;minCount&quot;</span><span class="p">,</span>
<span class="s2">&quot;the minimum number of times a token must appear to be included in the &quot;</span> <span class="o">+</span>
<span class="s2">&quot;word2vec model&#39;s vocabulary&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">windowSize</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;windowSize&quot;</span><span class="p">,</span>
<span class="s2">&quot;the window size (context words from [-window, window]). Default value is 5&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">maxSentenceLength</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;maxSentenceLength&quot;</span><span class="p">,</span>
<span class="s2">&quot;Maximum length (in words) of each sentence in the input data. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Any sentence longer than this threshold will &quot;</span> <span class="o">+</span>
<span class="s2">&quot;be divided into chunks up to the size.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_Word2VecParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">windowSize</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">maxSentenceLength</span><span class="o">=</span><span class="mi">1000</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of vectorSize or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vectorSize</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numPartitions or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numPartitions</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of minCount or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minCount</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getWindowSize</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of windowSize or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">windowSize</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getMaxSentenceLength</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of maxSentenceLength or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxSentenceLength</span><span class="p">)</span>
<div class="viewcode-block" id="Word2Vec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Word2Vec</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_Word2VecParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further</span>
<span class="sd"> natural language processing or machine learning process.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; sent = (&quot;a b &quot; * 100 + &quot;a c &quot; * 10).split(&quot; &quot;)</span>
<span class="sd"> &gt;&gt;&gt; doc = spark.createDataFrame([(sent,), (sent,)], [&quot;sentence&quot;])</span>
<span class="sd"> &gt;&gt;&gt; word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol=&quot;sentence&quot;, outputCol=&quot;model&quot;)</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.setMaxIter(10)</span>
<span class="sd"> Word2Vec...</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.getMaxIter()</span>
<span class="sd"> 10</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.clear(word2Vec.maxIter)</span>
<span class="sd"> &gt;&gt;&gt; model = word2Vec.fit(doc)</span>
<span class="sd"> &gt;&gt;&gt; model.getMinCount()</span>
<span class="sd"> 5</span>
<span class="sd"> &gt;&gt;&gt; model.setInputCol(&quot;sentence&quot;)</span>
<span class="sd"> Word2VecModel...</span>
<span class="sd"> &gt;&gt;&gt; model.getVectors().show()</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> |word| vector|</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> | a|[0.0951...</span>
<span class="sd"> | b|[-1.202...</span>
<span class="sd"> | c|[0.3015...</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.findSynonymsArray(&quot;a&quot;, 2)</span>
<span class="sd"> [(&#39;b&#39;, 0.015859...), (&#39;c&#39;, -0.568079...)]</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import format_number as fmt</span>
<span class="sd"> &gt;&gt;&gt; model.findSynonyms(&quot;a&quot;, 2).select(&quot;word&quot;, fmt(&quot;similarity&quot;, 5).alias(&quot;similarity&quot;)).show()</span>
<span class="sd"> +----+----------+</span>
<span class="sd"> |word|similarity|</span>
<span class="sd"> +----+----------+</span>
<span class="sd"> | b| 0.01586|</span>
<span class="sd"> | c| -0.56808|</span>
<span class="sd"> +----+----------+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(doc).head().model</span>
<span class="sd"> DenseVector([-0.4833, 0.1855, -0.273, -0.0509, -0.4769])</span>
<span class="sd"> &gt;&gt;&gt; word2vecPath = temp_path + &quot;/word2vec&quot;</span>
<span class="sd"> &gt;&gt;&gt; word2Vec.save(word2vecPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec = Word2Vec.load(word2vecPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedWord2Vec.getMinCount() == word2Vec.getMinCount()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/word2vec-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = Word2VecModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.getVectors().first().word == model.getVectors().first().word</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.getVectors().first().vector == model.getVectors().first().vector</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(doc).take(1) == model.transform(doc).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span>
<span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">windowSize</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="n">maxSentenceLength</span><span class="o">=</span><span class="mi">1000</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, \</span>
<span class="sd"> maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, \</span>
<span class="sd"> maxSentenceLength=1000)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Word2Vec</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.Word2Vec&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Word2Vec.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span>
<span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">windowSize</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="n">maxSentenceLength</span><span class="o">=</span><span class="mi">1000</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, \</span>
<span class="sd"> maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, \</span>
<span class="sd"> maxSentenceLength=1000)</span>
<span class="sd"> Sets params for this Word2Vec.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setVectorSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setVectorSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`vectorSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setNumPartitions"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setNumPartitions">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numPartitions`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numPartitions</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setMinCount"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMinCount">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`minCount`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minCount</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setWindowSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setWindowSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setWindowSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`windowSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">windowSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setMaxSentenceLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMaxSentenceLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setMaxSentenceLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxSentenceLength`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxSentenceLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMaxIter">[docs]</a> <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`maxIter`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`seed`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2Vec.setStepSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setStepSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.4.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStepSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stepSize`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stepSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">Word2VecModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel">[docs]</a><span class="k">class</span> <span class="nc">Word2VecModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_Word2VecParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`Word2Vec`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="Word2VecModel.getVectors"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.getVectors">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVectors</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the vector representation of the words as a dataframe</span>
<span class="sd"> with two fields, word and vector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;getVectors&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.findSynonyms"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.findSynonyms">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">findSynonyms</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Find &quot;num&quot; number of words closest in similarity to &quot;word&quot;.</span>
<span class="sd"> word can be a string or vector representation.</span>
<span class="sd"> Returns a dataframe with two fields word and similarity (which</span>
<span class="sd"> gives the cosine similarity).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">word</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;findSynonyms&quot;</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">)</span></div>
<div class="viewcode-block" id="Word2VecModel.findSynonymsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.findSynonymsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">findSynonymsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Find &quot;num&quot; number of words closest in similarity to &quot;word&quot;.</span>
<span class="sd"> word can be a string or vector representation.</span>
<span class="sd"> Returns an array with two fields word and similarity (which</span>
<span class="sd"> gives the cosine similarity).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">word</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="n">tuples</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">findSynonymsArray</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">st</span><span class="p">:</span> <span class="p">(</span><span class="n">st</span><span class="o">.</span><span class="n">_1</span><span class="p">(),</span> <span class="n">st</span><span class="o">.</span><span class="n">_2</span><span class="p">()),</span> <span class="nb">list</span><span class="p">(</span><span class="n">tuples</span><span class="p">)))</span></div></div>
<span class="k">class</span> <span class="nc">_PCAParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`PCA` and :py:class:`PCAModel`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">k</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;k&quot;</span><span class="p">,</span> <span class="s2">&quot;the number of principal components&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of k or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
<div class="viewcode-block" id="PCA"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PCA</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_PCAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> PCA trains a model to project vectors to a lower dimensional space of the</span>
<span class="sd"> top :py:attr:`k` principal components.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),</span>
<span class="sd"> ... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),</span>
<span class="sd"> ... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data,[&quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; pca = PCA(k=2, inputCol=&quot;features&quot;)</span>
<span class="sd"> &gt;&gt;&gt; pca.setOutputCol(&quot;pca_features&quot;)</span>
<span class="sd"> PCA...</span>
<span class="sd"> &gt;&gt;&gt; model = pca.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getK()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; model.setOutputCol(&quot;output&quot;)</span>
<span class="sd"> PCAModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).collect()[0].output</span>
<span class="sd"> DenseVector([1.648..., -4.013...])</span>
<span class="sd"> &gt;&gt;&gt; model.explainedVariance</span>
<span class="sd"> DenseVector([0.794..., 0.205...])</span>
<span class="sd"> &gt;&gt;&gt; pcaPath = temp_path + &quot;/pca&quot;</span>
<span class="sd"> &gt;&gt;&gt; pca.save(pcaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPca = PCA.load(pcaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPca.getK() == pca.getK()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/pca-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = PCAModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.pc == model.pc</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.explainedVariance == model.explainedVariance</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, k=None, inputCol=None, outputCol=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PCA</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.PCA&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="PCA.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, k=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Set params for this PCA.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCA.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCA.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCA.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">PCAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel">[docs]</a><span class="k">class</span> <span class="nc">PCAModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_PCAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="PCAModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="PCAModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pc</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a principal components Matrix.</span>
<span class="sd"> Each column is one principal component.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;pc&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">explainedVariance</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a vector of proportions of variance</span>
<span class="sd"> explained by each principal component.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;explainedVariance&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_RFormulaParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`RFormula` and :py:class:`RFormula`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">formula</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;formula&quot;</span><span class="p">,</span> <span class="s2">&quot;R model formula&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">forceIndexLabel</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;forceIndexLabel&quot;</span><span class="p">,</span>
<span class="s2">&quot;Force to index label whether it is numeric or string&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">)</span>
<span class="n">stringIndexerOrderType</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;stringIndexerOrderType&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to order categories of a string feature column used by &quot;</span> <span class="o">+</span>
<span class="s2">&quot;StringIndexer. The last category after ordering is dropped &quot;</span> <span class="o">+</span>
<span class="s2">&quot;when encoding strings. Supported options: frequencyDesc, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;frequencyAsc, alphabetDesc, alphabetAsc. The default value &quot;</span> <span class="o">+</span>
<span class="s2">&quot;is frequencyDesc. When the ordering is set to alphabetDesc, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;RFormula drops the same category as R when encoding strings.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span> <span class="s2">&quot;how to handle invalid entries. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Options are &#39;skip&#39; (filter out rows with invalid values), &quot;</span> <span class="o">+</span>
<span class="s2">&quot;&#39;error&#39; (throw an error), or &#39;keep&#39; (put invalid data in a special &quot;</span> <span class="o">+</span>
<span class="s2">&quot;additional bucket, at index numLabels).&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_RFormulaParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">forceIndexLabel</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`formula`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getForceIndexLabel</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`forceIndexLabel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">forceIndexLabel</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStringIndexerOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of :py:attr:`stringIndexerOrderType` or its default value &#39;frequencyDesc&#39;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stringIndexerOrderType</span><span class="p">)</span>
<div class="viewcode-block" id="RFormula"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">RFormula</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_RFormulaParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Implements the transforms required for fitting a dataset against an</span>
<span class="sd"> R model formula. Currently we support a limited subset of the R</span>
<span class="sd"> operators, including &#39;~&#39;, &#39;.&#39;, &#39;:&#39;, &#39;+&#39;, &#39;-&#39;, &#39;*&#39;, and &#39;^&#39;.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Also see the `R formula docs</span>
<span class="sd"> &lt;http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html&gt;`_.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (1.0, 1.0, &quot;a&quot;),</span>
<span class="sd"> ... (0.0, 2.0, &quot;b&quot;),</span>
<span class="sd"> ... (0.0, 0.0, &quot;a&quot;)</span>
<span class="sd"> ... ], [&quot;y&quot;, &quot;x&quot;, &quot;s&quot;])</span>
<span class="sd"> &gt;&gt;&gt; rf = RFormula(formula=&quot;y ~ x + s&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = rf.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getLabelCol()</span>
<span class="sd"> &#39;label&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).show()</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> | y| x| s| features|label|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> |1.0|1.0| a|[1.0,1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b|[2.0,0.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a|[0.0,1.0]| 0.0|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; rf.fit(df, {rf.formula: &quot;y ~ . - s&quot;}).transform(df).show()</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> | y| x| s|features|label|</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> |1.0|1.0| a| [1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b| [2.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a| [0.0]| 0.0|</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; rFormulaPath = temp_path + &quot;/rFormula&quot;</span>
<span class="sd"> &gt;&gt;&gt; rf.save(rFormulaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRF = RFormula.load(rFormulaPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getFormula() == rf.getFormula()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getFeaturesCol() == rf.getFeaturesCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getLabelCol() == rf.getLabelCol()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedRF.getHandleInvalid() == rf.getHandleInvalid()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; str(loadedRF)</span>
<span class="sd"> &#39;RFormula(y ~ x + s) (uid=...)&#39;</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/rFormulaModel&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = RFormulaModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.uid == model.uid</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).show()</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> | y| x| s| features|label|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> |1.0|1.0| a|[1.0,1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b|[2.0,0.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a|[0.0,1.0]| 0.0|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; str(loadedModel)</span>
<span class="sd"> &#39;RFormulaModel(ResolvedRFormula(label=y, terms=[x,s], hasIntercept=true)) (uid=...)&#39;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">formula</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">forceIndexLabel</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, formula=None, featuresCol=&quot;features&quot;, labelCol=&quot;label&quot;, \</span>
<span class="sd"> forceIndexLabel=False, stringIndexerOrderType=&quot;frequencyDesc&quot;, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RFormula</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.RFormula&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="RFormula.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">formula</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span>
<span class="n">forceIndexLabel</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">,</span>
<span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, formula=None, featuresCol=&quot;features&quot;, labelCol=&quot;label&quot;, \</span>
<span class="sd"> forceIndexLabel=False, stringIndexerOrderType=&quot;frequencyDesc&quot;, \</span>
<span class="sd"> handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for RFormula.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setFormula"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setFormula">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.5.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`formula`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">formula</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setForceIndexLabel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setForceIndexLabel">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setForceIndexLabel</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`forceIndexLabel`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">forceIndexLabel</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setStringIndexerOrderType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setStringIndexerOrderType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setStringIndexerOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`stringIndexerOrderType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setFeaturesCol">[docs]</a> <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setLabelCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setLabelCol">[docs]</a> <span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormula.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">RFormulaModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">formulaStr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getFormula</span><span class="p">()</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">isDefined</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">)</span> <span class="k">else</span> <span class="s2">&quot;&quot;</span>
<span class="k">return</span> <span class="s2">&quot;RFormula(</span><span class="si">%s</span><span class="s2">) (uid=</span><span class="si">%s</span><span class="s2">)&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">formulaStr</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span></div>
<div class="viewcode-block" id="RFormulaModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormulaModel.html#pyspark.ml.feature.RFormulaModel">[docs]</a><span class="k">class</span> <span class="nc">RFormulaModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_RFormulaParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`RFormula`. Fitting is required to determine the</span>
<span class="sd"> factor levels of formula terms.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">resolvedFormula</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;resolvedFormula&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="s2">&quot;RFormulaModel(</span><span class="si">%s</span><span class="s2">) (uid=</span><span class="si">%s</span><span class="s2">)&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">resolvedFormula</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_SelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`Selector` and :py:class:`SelectorModel`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">selectorType</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;selectorType&quot;</span><span class="p">,</span>
<span class="s2">&quot;The selector type. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Supported options: numTopFeatures (default), percentile, fpr, fdr, fwe.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">numTopFeatures</span> <span class="o">=</span> \
<span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span>
<span class="s2">&quot;Number of features that selector will select, ordered by ascending p-value. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;If the number of features is &lt; numTopFeatures, then this will select &quot;</span> <span class="o">+</span>
<span class="s2">&quot;all features.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<span class="n">percentile</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;percentile&quot;</span><span class="p">,</span> <span class="s2">&quot;Percentile of features that selector &quot;</span> <span class="o">+</span>
<span class="s2">&quot;will select, ordered by ascending p-value.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">fpr</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;fpr&quot;</span><span class="p">,</span> <span class="s2">&quot;The highest p-value for features to be kept.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">fdr</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;fdr&quot;</span><span class="p">,</span> <span class="s2">&quot;The upper bound of the expected false discovery rate.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="n">fwe</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;fwe&quot;</span><span class="p">,</span> <span class="s2">&quot;The upper bound of the expected family-wise error rate.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_SelectorParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span> <span class="n">selectorType</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span> <span class="n">percentile</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span>
<span class="n">fpr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> <span class="n">fdr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> <span class="n">fwe</span><span class="o">=</span><span class="mf">0.05</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSelectorType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of selectorType or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectorType</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getNumTopFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of numTopFeatures or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numTopFeatures</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getPercentile</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of percentile or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">percentile</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFpr</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of fpr or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fpr</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFdr</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of fdr or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fdr</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFwe</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of fwe or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fwe</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">_Selector</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_SelectorParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Selectors.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSelectorType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`selectorType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectorType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setNumTopFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`numTopFeatures`.</span>
<span class="sd"> Only applicable when selectorType = &quot;numTopFeatures&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numTopFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setPercentile</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`percentile`.</span>
<span class="sd"> Only applicable when selectorType = &quot;percentile&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">percentile</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`fpr`.</span>
<span class="sd"> Only applicable when selectorType = &quot;fpr&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fpr</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFdr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`fdr`.</span>
<span class="sd"> Only applicable when selectorType = &quot;fdr&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fdr</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.2.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFwe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`fwe`.</span>
<span class="sd"> Only applicable when selectorType = &quot;fwe&quot;.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fwe</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<!-- setFeaturesCol(value): setter for the input `featuresCol` Param name;
     returns self via Params._set. -->
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<!-- setOutputCol(value): setter for the `outputCol` Param name; returns self
     via Params._set. -->
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<!-- setLabelCol(value): setter for the `labelCol` Param name; returns self
     via Params._set. -->
<span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<!-- _SelectorModel: mixin base for fitted Selector models. Combines JavaModel
     (JVM-backed model wrapper) with the shared _SelectorParams. Exposes
     setters for featuresCol/outputCol and the read-only `selectedFeatures`
     property, which forwards to the JVM model via _call_java. -->
<span class="k">class</span> <span class="nc">_SelectorModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_SelectorParams</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Mixin for Selector models.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span>
<!-- Read-only property: delegates to the wrapped JVM model's
     selectedFeatures() accessor. -->
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> List of indices to select (filter).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;selectedFeatures&quot;</span><span class="p">)</span>
<!-- ChiSqSelector: chi-squared feature selection Estimator (deprecated since
     3.1.0 in favor of UnivariateFeatureSelector, per its docstring below).
     Wraps org.apache.spark.ml.feature.ChiSqSelector on the JVM; fit()
     produces a ChiSqSelectorModel via _create_model. -->
<div class="viewcode-block" id="ChiSqSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelector.html#pyspark.ml.feature.ChiSqSelector">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">ChiSqSelector</span><span class="p">(</span><span class="n">_Selector</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Chi-Squared feature selection, which selects categorical features to use for predicting a</span>
<span class="sd"> categorical label.</span>
<span class="sd"> The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`,</span>
<span class="sd"> `fdr`, `fwe`.</span>
<span class="sd"> * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.</span>
<span class="sd"> * `percentile` is similar but chooses a fraction of all features</span>
<span class="sd"> instead of a fixed number.</span>
<span class="sd"> * `fpr` chooses all features whose p-values are below a threshold,</span>
<span class="sd"> thus controlling the false positive rate of selection.</span>
<span class="sd"> * `fdr` uses the `Benjamini-Hochberg procedure &lt;https://en.wikipedia.org/wiki/</span>
<span class="sd"> False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure&gt;`_</span>
<span class="sd"> to choose all features whose false discovery rate is below a threshold.</span>
<span class="sd"> * `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by</span>
<span class="sd"> 1/numFeatures, thus controlling the family-wise error rate of selection.</span>
<span class="sd"> By default, the selection method is `numTopFeatures`, with the default number of top features</span>
<span class="sd"> set to 50.</span>
<span class="sd"> .. deprecated:: 3.1.0</span>
<span class="sd"> Use UnivariateFeatureSelector</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),</span>
<span class="sd"> ... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),</span>
<span class="sd"> ... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],</span>
<span class="sd"> ... [&quot;features&quot;, &quot;label&quot;])</span>
<span class="sd"> &gt;&gt;&gt; selector = ChiSqSelector(numTopFeatures=1, outputCol=&quot;selectedFeatures&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = selector.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getFeaturesCol()</span>
<span class="sd"> &#39;features&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setFeaturesCol(&quot;features&quot;)</span>
<span class="sd"> ChiSqSelectorModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().selectedFeatures</span>
<span class="sd"> DenseVector([18.0])</span>
<span class="sd"> &gt;&gt;&gt; model.selectedFeatures</span>
<span class="sd"> [2]</span>
<span class="sd"> &gt;&gt;&gt; chiSqSelectorPath = temp_path + &quot;/chi-sq-selector&quot;</span>
<span class="sd"> &gt;&gt;&gt; selector.save(chiSqSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector = ChiSqSelector.load(chiSqSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector.getNumTopFeatures() == selector.getNumTopFeatures()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/chi-sq-selector-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = ChiSqSelectorModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.selectedFeatures == model.selectedFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<!-- __init__: creates the JVM ChiSqSelector and forwards all keyword-only
     args to setParams via the @keyword_only-captured _input_kwargs. -->
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">selectorType</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span> <span class="n">percentile</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span> <span class="n">fpr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span>
<span class="n">fdr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> <span class="n">fwe</span><span class="o">=</span><span class="mf">0.05</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, numTopFeatures=50, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;label&quot;, selectorType=&quot;numTopFeatures&quot;, percentile=0.1, fpr=0.05, \</span>
<span class="sd"> fdr=0.05, fwe=0.05)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">ChiSqSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.ChiSqSelector&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<!-- NOTE(review): setParams declares labelCol=&quot;labels&quot; while __init__
     declares labelCol=&quot;label&quot; (and the docstring repeats the mismatch).
     Presumably a typo in the upstream pyspark source; harmless at runtime
     because @keyword_only forwards only the kwargs the caller actually
     passed (see kwargs = self._input_kwargs below), so this default is
     never applied. Confirm against the pyspark repository before relying
     on it. -->
<div class="viewcode-block" id="ChiSqSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelector.html#pyspark.ml.feature.ChiSqSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;labels&quot;</span><span class="p">,</span> <span class="n">selectorType</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">,</span> <span class="n">percentile</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span> <span class="n">fpr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span>
<span class="n">fdr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> <span class="n">fwe</span><span class="o">=</span><span class="mf">0.05</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, numTopFeatures=50, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;labels&quot;, selectorType=&quot;numTopFeatures&quot;, percentile=0.1, fpr=0.05, \</span>
<span class="sd"> fdr=0.05, fwe=0.05)</span>
<span class="sd"> Sets params for this ChiSqSelector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<!-- _create_model: hook called by JavaEstimator.fit to wrap the fitted JVM
     model in the Python model class. -->
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">ChiSqSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<!-- ChiSqSelectorModel: fitted model for ChiSqSelector. Empty body; all
     behavior (setters, selectedFeatures, persistence) is inherited from
     _SelectorModel, JavaMLReadable and JavaMLWritable. -->
<div class="viewcode-block" id="ChiSqSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelectorModel.html#pyspark.ml.feature.ChiSqSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">ChiSqSelectorModel</span><span class="p">(</span><span class="n">_SelectorModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`ChiSqSelector`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<!-- VectorSizeHint: Transformer that attaches vector-size metadata to
     `inputCol` (no outputCol). Wraps org.apache.spark.ml.feature.VectorSizeHint
     on the JVM. Declares two Params: `size` (int) and `handleInvalid`
     (string; skip / error / optimistic, defaulting to "error" via
     _setDefault in __init__). -->
<div class="viewcode-block" id="VectorSizeHint"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VectorSizeHint</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A feature transformer that adds size information to the metadata of a vector column.</span>
<span class="sd"> VectorAssembler needs size information for its input columns and cannot be used on streaming</span>
<span class="sd"> dataframes without this metadata.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> VectorSizeHint modifies `inputCol` to include size metadata and does not have an outputCol.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml import Pipeline, PipelineModel</span>
<span class="sd"> &gt;&gt;&gt; data = [(Vectors.dense([1., 2., 3.]), 4.)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;vector&quot;, &quot;float&quot;])</span>
<span class="sd"> &gt;&gt;&gt;</span>
<span class="sd"> &gt;&gt;&gt; sizeHint = VectorSizeHint(inputCol=&quot;vector&quot;, size=3, handleInvalid=&quot;skip&quot;)</span>
<span class="sd"> &gt;&gt;&gt; vecAssembler = VectorAssembler(inputCols=[&quot;vector&quot;, &quot;float&quot;], outputCol=&quot;assembled&quot;)</span>
<span class="sd"> &gt;&gt;&gt; pipeline = Pipeline(stages=[sizeHint, vecAssembler])</span>
<span class="sd"> &gt;&gt;&gt;</span>
<span class="sd"> &gt;&gt;&gt; pipelineModel = pipeline.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; pipelineModel.transform(df).head().assembled</span>
<span class="sd"> DenseVector([1.0, 2.0, 3.0, 4.0])</span>
<span class="sd"> &gt;&gt;&gt; vectorSizeHintPath = temp_path + &quot;/vector-size-hint-pipeline&quot;</span>
<span class="sd"> &gt;&gt;&gt; pipelineModel.save(vectorSizeHintPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedPipeline = PipelineModel.load(vectorSizeHintPath)</span>
<span class="sd"> &gt;&gt;&gt; loaded = loadedPipeline.transform(df).head().assembled</span>
<span class="sd"> &gt;&gt;&gt; expected = pipelineModel.transform(df).head().assembled</span>
<span class="sd"> &gt;&gt;&gt; loaded == expected</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">size</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;size&quot;</span><span class="p">,</span> <span class="s2">&quot;Size of vectors in column.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">)</span>
<!-- handleInvalid redeclares the shared Param with class-specific doc text;
     TypeConverters.toString is passed positionally as the typeConverter. -->
<span class="n">handleInvalid</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;handleInvalid&quot;</span><span class="p">,</span>
<span class="s2">&quot;How to handle invalid vectors in inputCol. Invalid vectors include &quot;</span>
<span class="s2">&quot;nulls and vectors with the wrong size. The options are `skip` (filter &quot;</span>
<span class="s2">&quot;out rows with invalid vectors), `error` (throw an error) and &quot;</span>
<span class="s2">&quot;`optimistic` (do not check the vector size, and keep all rows). &quot;</span>
<span class="s2">&quot;`error` by default.&quot;</span><span class="p">,</span>
<span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, inputCol=None, size=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorSizeHint</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.VectorSizeHint&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VectorSizeHint.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, inputCol=None, size=None, handleInvalid=&quot;error&quot;)</span>
<span class="sd"> Sets params for this VectorSizeHint.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.getSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.getSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSize</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; Gets size param, the size of vectors in `inputCol`.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">size</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.setSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot; Sets size param, the size of vectors in `inputCol`.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`inputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VectorSizeHint.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div>
<!-- _VarianceThresholdSelectorParams: shared Param mixin for
     VarianceThresholdSelector and its model. Declares the float
     `varianceThreshold` Param and its getter. -->
<span class="k">class</span> <span class="nc">_VarianceThresholdSelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`VarianceThresholdSelector` and</span>
<span class="sd"> :py:class:`VarianceThresholdSelectorModel`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">varianceThreshold</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;varianceThreshold&quot;</span><span class="p">,</span>
<span class="s2">&quot;Param for variance threshold. Features with a variance not &quot;</span> <span class="o">+</span>
<span class="s2">&quot;greater than this threshold will be removed. The default value &quot;</span> <span class="o">+</span>
<span class="s2">&quot;is 0.0.&quot;</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<!-- Getter: returns the set value or the default registered by the estimator. -->
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getVarianceThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of varianceThreshold or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">varianceThreshold</span><span class="p">)</span>
<div class="viewcode-block" id="VarianceThresholdSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">VarianceThresholdSelector</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_VarianceThresholdSelectorParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Feature selector that removes all low-variance features. Features with a</span>
<span class="sd"> variance not greater than the threshold will be removed. The default is to keep</span>
<span class="sd"> all features with non-zero variance, i.e. remove the features that have the</span>
<span class="sd"> same value in all samples.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(Vectors.dense([6.0, 7.0, 0.0, 7.0, 6.0, 0.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 9.0, 6.0, 0.0, 5.0, 9.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 9.0, 3.0, 0.0, 5.0, 5.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 9.0, 8.0, 5.0, 6.0, 4.0]),),</span>
<span class="sd"> ... (Vectors.dense([8.0, 9.0, 6.0, 5.0, 4.0, 4.0]),),</span>
<span class="sd"> ... (Vectors.dense([8.0, 9.0, 6.0, 0.0, 0.0, 0.0]),)],</span>
<span class="sd"> ... [&quot;features&quot;])</span>
<span class="sd"> &gt;&gt;&gt; selector = VarianceThresholdSelector(varianceThreshold=8.2, outputCol=&quot;selectedFeatures&quot;)</span>
<span class="sd"> &gt;&gt;&gt; model = selector.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getFeaturesCol()</span>
<span class="sd"> &#39;features&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setFeaturesCol(&quot;features&quot;)</span>
<span class="sd"> VarianceThresholdSelectorModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().selectedFeatures</span>
<span class="sd"> DenseVector([6.0, 7.0, 0.0])</span>
<span class="sd"> &gt;&gt;&gt; model.selectedFeatures</span>
<span class="sd"> [0, 3, 5]</span>
<span class="sd"> &gt;&gt;&gt; varianceThresholdSelectorPath = temp_path + &quot;/variance-threshold-selector&quot;</span>
<span class="sd"> &gt;&gt;&gt; selector.save(varianceThresholdSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector = VarianceThresholdSelector.load(varianceThresholdSelectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector.getVarianceThreshold() == selector.getVarianceThreshold()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/variance-threshold-selector-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = VarianceThresholdSelectorModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.selectedFeatures == model.selectedFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">varianceThreshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, varianceThreshold=0.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VarianceThresholdSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.feature.VarianceThresholdSelector&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">varianceThreshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="VarianceThresholdSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">varianceThreshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, varianceThreshold=0.0)</span>
<span class="sd"> Sets params for this VarianceThresholdSelector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelector.setVarianceThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setVarianceThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setVarianceThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`varianceThreshold`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">varianceThreshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelector.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelector.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">VarianceThresholdSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">VarianceThresholdSelectorModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_VarianceThresholdSelectorParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`VarianceThresholdSelector`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="VarianceThresholdSelectorModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="VarianceThresholdSelectorModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> List of indices to select (filter).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;selectedFeatures&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">_UnivariateFeatureSelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Params for :py:class:`UnivariateFeatureSelector` and</span>
<span class="sd"> :py:class:`UnivariateFeatureSelectorModel`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">featureType</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;featureType&quot;</span><span class="p">,</span>
<span class="s2">&quot;The feature type. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Supported options: categorical, continuous.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">labelType</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;labelType&quot;</span><span class="p">,</span>
<span class="s2">&quot;The label type. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Supported options: categorical, continuous.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">selectionMode</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;selectionMode&quot;</span><span class="p">,</span>
<span class="s2">&quot;The selection mode. &quot;</span> <span class="o">+</span>
<span class="s2">&quot;Supported options: numTopFeatures (default), percentile, fpr, &quot;</span> <span class="o">+</span>
<span class="s2">&quot;fdr, fwe.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">)</span>
<span class="n">selectionThreshold</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;selectionThreshold&quot;</span><span class="p">,</span> <span class="s2">&quot;The upper bound of the &quot;</span> <span class="o">+</span>
<span class="s2">&quot;features that selector will select.&quot;</span><span class="p">,</span>
<span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">selectionMode</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getFeatureType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of featureType or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">featureType</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getLabelType</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of labelType or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">labelType</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSelectionMode</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of selectionMode or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectionMode</span><span class="p">)</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getSelectionThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gets the value of selectionThreshold or its default value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectionThreshold</span><span class="p">)</span>
<div class="viewcode-block" id="UnivariateFeatureSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">UnivariateFeatureSelector</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> UnivariateFeatureSelector</span>
<span class="sd"> Feature selector based on univariate statistical tests against labels. Currently, Spark</span>
<span class="sd"> supports three Univariate Feature Selectors: chi-squared, ANOVA F-test and F-value.</span>
<span class="sd"> User can choose Univariate Feature Selector by setting `featureType` and `labelType`,</span>
<span class="sd"> and Spark will pick the score function based on the specified `featureType` and `labelType`.</span>
<span class="sd"> The following combination of `featureType` and `labelType` are supported:</span>
<span class="sd"> - `featureType` `categorical` and `labelType` `categorical`, Spark uses chi-squared,</span>
<span class="sd"> i.e. chi2 in sklearn.</span>
<span class="sd"> - `featureType` `continuous` and `labelType` `categorical`, Spark uses ANOVA F-test,</span>
<span class="sd"> i.e. f_classif in sklearn.</span>
<span class="sd"> - `featureType` `continuous` and `labelType` `continuous`, Spark uses F-value,</span>
<span class="sd"> i.e. f_regression in sklearn.</span>
<span class="sd"> The `UnivariateFeatureSelector` supports different selection modes: `numTopFeatures`,</span>
<span class="sd"> `percentile`, `fpr`, `fdr`, `fwe`.</span>
<span class="sd"> - `numTopFeatures` chooses a fixed number of top features according to a</span>
<span class="sd"> hypothesis.</span>
<span class="sd"> - `percentile` is similar but chooses a fraction of all features</span>
<span class="sd"> instead of a fixed number.</span>
<span class="sd"> - `fpr` chooses all features whose p-values are below a threshold,</span>
<span class="sd"> thus controlling the false positive rate of selection.</span>
<span class="sd"> - `fdr` uses the `Benjamini-Hochberg procedure \</span>
<span class="sd"> &lt;https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure&gt;`_</span>
<span class="sd"> to choose all features whose false discovery rate is below a threshold.</span>
<span class="sd"> - `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by</span>
<span class="sd"> 1 / `numFeatures`, thus controlling the family-wise error rate of selection.</span>
<span class="sd"> By default, the selection mode is `numTopFeatures`.</span>
<span class="sd"> .. versionadded:: 3.1.1</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.ml.linalg import Vectors</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(Vectors.dense([1.7, 4.4, 7.6, 5.8, 9.6, 2.3]), 3.0),</span>
<span class="sd"> ... (Vectors.dense([8.8, 7.3, 5.7, 7.3, 2.2, 4.1]), 2.0),</span>
<span class="sd"> ... (Vectors.dense([1.2, 9.5, 2.5, 3.1, 8.7, 2.5]), 1.0),</span>
<span class="sd"> ... (Vectors.dense([3.7, 9.2, 6.1, 4.1, 7.5, 3.8]), 2.0),</span>
<span class="sd"> ... (Vectors.dense([8.9, 5.2, 7.8, 8.3, 5.2, 3.0]), 4.0),</span>
<span class="sd"> ... (Vectors.dense([7.9, 8.5, 9.2, 4.0, 9.4, 2.1]), 4.0)],</span>
<span class="sd"> ... [&quot;features&quot;, &quot;label&quot;])</span>
<span class="sd"> &gt;&gt;&gt; selector = UnivariateFeatureSelector(outputCol=&quot;selectedFeatures&quot;)</span>
<span class="sd"> &gt;&gt;&gt; selector.setFeatureType(&quot;continuous&quot;).setLabelType(&quot;categorical&quot;).setSelectionThreshold(1)</span>
<span class="sd"> UnivariateFeatureSelector...</span>
<span class="sd"> &gt;&gt;&gt; model = selector.fit(df)</span>
<span class="sd"> &gt;&gt;&gt; model.getFeaturesCol()</span>
<span class="sd"> &#39;features&#39;</span>
<span class="sd"> &gt;&gt;&gt; model.setFeaturesCol(&quot;features&quot;)</span>
<span class="sd"> UnivariateFeatureSelectorModel...</span>
<span class="sd"> &gt;&gt;&gt; model.transform(df).head().selectedFeatures</span>
<span class="sd"> DenseVector([7.6])</span>
<span class="sd"> &gt;&gt;&gt; model.selectedFeatures</span>
<span class="sd"> [2]</span>
<span class="sd"> &gt;&gt;&gt; selectorPath = temp_path + &quot;/selector&quot;</span>
<span class="sd"> &gt;&gt;&gt; selector.save(selectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector = UnivariateFeatureSelector.load(selectorPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedSelector.getSelectionThreshold() == selector.getSelectionThreshold()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; modelPath = temp_path + &quot;/selector-model&quot;</span>
<span class="sd"> &gt;&gt;&gt; model.save(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel = UnivariateFeatureSelectorModel.load(modelPath)</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.selectedFeatures == model.selectedFeatures</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; loadedModel.transform(df).take(1) == model.transform(df).take(1)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">selectionMode</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;label&quot;, selectionMode=&quot;numTopFeatures&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">UnivariateFeatureSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.feature.UnivariateFeatureSelector&quot;</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="UnivariateFeatureSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s2">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">labelCol</span><span class="o">=</span><span class="s2">&quot;labels&quot;</span><span class="p">,</span> <span class="n">selectionMode</span><span class="o">=</span><span class="s2">&quot;numTopFeatures&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, featuresCol=&quot;features&quot;, outputCol=None, \</span>
<span class="sd"> labelCol=&quot;labels&quot;, selectionMode=&quot;numTopFeatures&quot;)</span>
<span class="sd"> Sets params for this UnivariateFeatureSelector.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setFeatureType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setFeatureType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeatureType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featureType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featureType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setLabelType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setLabelType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setLabelType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelType`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setSelectionMode"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setSelectionMode">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSelectionMode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`selectionMode`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectionMode</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setSelectionThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setSelectionThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setSelectionThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`selectionThreshold`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectionThreshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setFeaturesCol">[docs]</a> <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelector.setLabelCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setLabelCol">[docs]</a> <span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`labelCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">UnivariateFeatureSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">UnivariateFeatureSelectorModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Model fitted by :py:class:`UnivariateFeatureSelector`.</span>
<span class="sd"> .. versionadded:: 3.1.1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="UnivariateFeatureSelectorModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`featuresCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="UnivariateFeatureSelectorModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Sets the value of :py:attr:`outputCol`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;3.1.1&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> List of indices to select (filter).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">&quot;selectedFeatures&quot;</span><span class="p">)</span></div>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="kn">import</span> <span class="nn">pyspark.ml.feature</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span><span class="p">,</span> <span class="n">SparkSession</span>
<span class="n">globs</span> <span class="o">=</span> <span class="nb">globals</span><span class="p">()</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">features</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">feature</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">features</span><span class="p">)</span>
<span class="c1"># The small batch size here ensures that we see multiple batches,</span>
<span class="c1"># even in these small test examples:</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span>\
<span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[2]&quot;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;ml.feature tests&quot;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span>
<span class="n">globs</span><span class="p">[</span><span class="s1">&#39;sc&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span>
<span class="n">globs</span><span class="p">[</span><span class="s1">&#39;spark&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span>
<span class="n">testData</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;b&quot;</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;c&quot;</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;c&quot;</span><span class="p">)],</span> <span class="mi">2</span><span class="p">)</span>
<span class="n">globs</span><span class="p">[</span><span class="s1">&#39;stringIndDf&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">testData</span><span class="p">)</span>
<span class="n">temp_path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s1">&#39;temp_path&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">temp_path</span>
<span class="k">try</span><span class="p">:</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">finally</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">shutil</span> <span class="kn">import</span> <span class="n">rmtree</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">rmtree</span><span class="p">(</span><span class="n">temp_path</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
<!-- Placeholder for prev/next page navigation (empty on module-source pages). -->
<div class="prev-next-bottom">
</div>
</main>
</div>
</div>
<script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
<!-- NOTE(review): copyright holder text is empty ("Copyright .") — presumably
     the Sphinx `copyright` value in conf.py was not set; confirm upstream. -->
&copy; Copyright .<br>
<!-- Use the canonical https URL for the Sphinx project site. -->
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br>
</p>
</div>
</footer>
</body>
</html>