<!-- blob: d234411839c3a6ae703b3f99f6975d8fd188248a [file] [log] [blame] -->
<!DOCTYPE html>
<!-- Declare the page language for assistive technology; "en" matches the docsearch:language meta value in the head. -->
<html lang="en">
<head>
  <meta charset="utf-8" />
  <title>StringIndexer &#8212; PySpark 3.2.2 documentation</title>

  <!-- Stylesheets, in cascade order: theme bundle, icon font, web fonts, Sphinx base, syntax highlighting, PySpark overrides. -->
  <link rel="stylesheet" href="../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
  <link rel="stylesheet"
    href="../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <!-- Preload the Font Awesome webfonts; font preloads need the crossorigin attribute. -->
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
  <link rel="stylesheet"
    href="../../_static/vendor/open-sans_all/1.44.1/index.css">
  <link rel="stylesheet"
    href="../../_static/vendor/lato_latin-ext/1.44.1/index.css">
  <link rel="stylesheet" href="../../_static/basic.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" type="text/css" href="../../_static/css/pyspark.css" />

  <!-- Scripts. These are synchronous and order-dependent, so their sequence is unchanged. -->
  <link rel="preload" as="script" href="../../_static/js/index.3da636dd464baa7582d2.js">
  <script id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
  <script src="../../_static/jquery.js"></script>
  <script src="../../_static/underscore.js"></script>
  <script src="../../_static/doctools.js"></script>
  <script src="../../_static/language_data.js"></script>
  <script src="../../_static/copybutton.js"></script>
  <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>

  <!-- MathJax: the inline configuration block is placed BEFORE the async loader so it is
       already in the document whenever MathJax starts up and scans for
       text/x-mathjax-config blocks. -->
  <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
  <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>

  <!-- Document relations: search page plus next/previous API pages. -->
  <link rel="search" title="Search" href="../../search.html" />
  <link rel="next" title="StringIndexerModel" href="pyspark.ml.feature.StringIndexerModel.html" />
  <link rel="prev" title="StopWordsRemover" href="pyspark.ml.feature.StopWordsRemover.html" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <meta name="docsearch:language" content="en" />
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<!-- Top navigation bar: brand logo, collapse toggle, and links to the top-level documentation sections. -->
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation">
  <div class="container-xl">
    <!-- The logo is the only content of this link, so its alt text names the link destination. -->
    <a class="navbar-brand" href="../../index.html">
      <img src="../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" />
    </a>
    <!-- Toggle for the collapsible menu below (targets #navbar-menu). -->
    <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
      <span class="navbar-toggler-icon"></span>
    </button>
    <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
      <ul id="navbar-main-elements" class="navbar-nav mr-auto">
        <li class="nav-item">
          <a class="nav-link" href="../../getting_started/index.html">Getting Started</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../user_guide/index.html">User Guide</a>
        </li>
        <!-- "active" marks the section this page belongs to. -->
        <li class="nav-item active">
          <a class="nav-link" href="../index.html">API Reference</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../development/index.html">Development</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../migration_guide/index.html">Migration Guide</a>
        </li>
      </ul>
      <!-- Second navbar list; it has no entries on this page. -->
      <ul class="navbar-nav">
      </ul>
    </div>
  </div>
</nav>
<div class="container-xl">
<div class="row">
<!-- Left sidebar: documentation search box followed by the API-reference section list. -->
<div class="col-12 col-md-3 bd-sidebar">
  <form class="bd-search d-flex align-items-center" action="../../search.html" method="get">
    <!-- Decorative icon: hidden from assistive technology; the input carries the accessible name. -->
    <i class="icon fas fa-search" aria-hidden="true"></i>
    <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
  </form>
  <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
    <div class="bd-toc-item active">
      <ul class="nav bd-sidenav">
        <li class="">
          <a href="../pyspark.sql.html">Spark SQL</a>
        </li>
        <li class="">
          <a href="../pyspark.pandas/index.html">Pandas API on Spark</a>
        </li>
        <li class="">
          <a href="../pyspark.ss.html">Structured Streaming</a>
        </li>
        <!-- "active" marks the section that contains the current page. -->
        <li class="active">
          <a href="../pyspark.ml.html">MLlib (DataFrame-based)</a>
        </li>
        <li class="">
          <a href="../pyspark.streaming.html">Spark Streaming</a>
        </li>
        <li class="">
          <a href="../pyspark.mllib.html">MLlib (RDD-based)</a>
        </li>
        <li class="">
          <a href="../pyspark.html">Spark Core</a>
        </li>
        <li class="">
          <a href="../pyspark.resource.html">Resource Management</a>
        </li>
      </ul>
    <!-- Explicitly close .bd-toc-item; it was previously left open and only closed implicitly by </nav>. -->
    </div>
  </nav>
</div>
<!-- In-page table of contents column (d-none d-xl-block: shown only at the xl breakpoint and up).
     The list has no entries on this page; the id is kept because <body data-target="#bd-toc-nav"> refers to it. -->
<div class="d-none d-xl-block col-xl-2 bd-toc">
  <!-- Labelled so this landmark can be told apart from the other <nav> elements on the page. -->
  <nav id="bd-toc-nav" aria-label="On this page">
    <ul class="nav section-nav flex-column">
    </ul>
  </nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<div class="section" id="stringindexer">
<h1>StringIndexer<a class="headerlink" href="#stringindexer" title="Permalink to this headline"></a></h1>
<dl class="py class">
<dt id="pyspark.ml.feature.StringIndexer">
<em class="property">class </em><code class="sig-prename descclassname">pyspark.ml.feature.</code><code class="sig-name descname">StringIndexer</code><span class="sig-paren">(</span><em class="sig-param"><span class="o">*</span></em>, <em class="sig-param"><span class="n">inputCol</span><span class="o">=</span><span class="default_value">None</span></em>, <em class="sig-param"><span class="n">outputCol</span><span class="o">=</span><span class="default_value">None</span></em>, <em class="sig-param"><span class="n">inputCols</span><span class="o">=</span><span class="default_value">None</span></em>, <em class="sig-param"><span class="n">outputCols</span><span class="o">=</span><span class="default_value">None</span></em>, <em class="sig-param"><span class="n">handleInvalid</span><span class="o">=</span><span class="default_value">'error'</span></em>, <em class="sig-param"><span class="n">stringOrderType</span><span class="o">=</span><span class="default_value">'frequencyDesc'</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer" title="Permalink to this definition"></a></dt>
<dd><p>A label indexer that maps a string column of labels to an ML column of label indices.
If the input column is numeric, we cast it to string and index the string values.
The indices are in [0, numLabels). By default, this is ordered by label frequencies
so the most frequent label gets index 0. The ordering behavior is controlled by
setting <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.stringOrderType" title="pyspark.ml.feature.StringIndexer.stringOrderType"><code class="xref py py-attr docutils literal notranslate"><span class="pre">stringOrderType</span></code></a>. Its default value is ‘frequencyDesc’.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.4.0.</span></p>
</div>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">stringOrderType</span><span class="o">=</span><span class="s2">&quot;frequencyDesc&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="go">StringIndexer...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="go">StringIndexerModel...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">td</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">td</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">td</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">td</span><span class="o">.</span><span class="n">indexed</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span>
<span class="gp">... </span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">inverter</span> <span class="o">=</span> <span class="n">IndexToString</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;label2&quot;</span><span class="p">,</span> <span class="n">labels</span><span class="o">=</span><span class="n">model</span><span class="o">.</span><span class="n">labels</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">itd</span> <span class="o">=</span> <span class="n">inverter</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">td</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">itd</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">itd</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">itd</span><span class="o">.</span><span class="n">label2</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span>
<span class="gp">... </span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, &#39;a&#39;), (1, &#39;b&#39;), (2, &#39;c&#39;), (3, &#39;a&#39;), (4, &#39;a&#39;), (5, &#39;c&#39;)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexerPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/string-indexer&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">stringIndexerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">stringIndexerPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedIndexer</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">()</span> <span class="o">==</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">modelPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/string-indexer-model&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">modelPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">labels</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">labels</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">indexToStringPath</span> <span class="o">=</span> <span class="n">temp_path</span> <span class="o">+</span> <span class="s2">&quot;/index-to-string&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">inverter</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">indexToStringPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedInverter</span> <span class="o">=</span> <span class="n">IndexToString</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">indexToStringPath</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedInverter</span><span class="o">.</span><span class="n">getLabels</span><span class="p">()</span> <span class="o">==</span> <span class="n">inverter</span><span class="o">.</span><span class="n">getLabels</span><span class="p">()</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">loadedModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> <span class="o">==</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="go">True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span><span class="o">.</span><span class="n">getStringOrderType</span><span class="p">()</span>
<span class="go">&#39;frequencyDesc&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">stringOrderType</span><span class="o">=</span><span class="s2">&quot;alphabetDesc&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">td</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">td</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">td</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">td</span><span class="o">.</span><span class="n">indexed</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span>
<span class="gp">... </span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">fromlabelsModel</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">from_labels</span><span class="p">([</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">],</span>
<span class="gp">... </span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">&quot;label&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">&quot;error&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">fromlabelsModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">stringIndDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">result</span><span class="o">.</span><span class="n">indexed</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span>
<span class="gp">... </span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, 0.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 2.0)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">testData</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">label1</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">label2</span><span class="o">=</span><span class="s2">&quot;e&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">label1</span><span class="o">=</span><span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="n">label2</span><span class="o">=</span><span class="s2">&quot;f&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">label1</span><span class="o">=</span><span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="n">label2</span><span class="o">=</span><span class="s2">&quot;e&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">label1</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">label2</span><span class="o">=</span><span class="s2">&quot;f&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">label1</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">label2</span><span class="o">=</span><span class="s2">&quot;f&quot;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">label1</span><span class="o">=</span><span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="n">label2</span><span class="o">=</span><span class="s2">&quot;f&quot;</span><span class="p">)],</span> <span class="mi">3</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">multiRowDf</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">testData</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">inputs</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;label1&quot;</span><span class="p">,</span> <span class="s2">&quot;label2&quot;</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">outputs</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;index1&quot;</span><span class="p">,</span> <span class="s2">&quot;index2&quot;</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">inputs</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="n">outputs</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">multiRowDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">multiRowDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">result</span><span class="o">.</span><span class="n">index1</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">result</span><span class="o">.</span><span class="n">index2</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, 0.0, 1.0), (1, 2.0, 0.0), (2, 1.0, 1.0), (3, 0.0, 0.0), (4, 0.0, 0.0), (5, 1.0, 0.0)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">fromlabelsModel</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">from_arrays_of_labels</span><span class="p">([[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">,</span> <span class="s2">&quot;c&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;e&quot;</span><span class="p">,</span> <span class="s2">&quot;f&quot;</span><span class="p">]],</span>
<span class="gp">... </span> <span class="n">inputCols</span><span class="o">=</span><span class="n">inputs</span><span class="p">,</span> <span class="n">outputCols</span><span class="o">=</span><span class="n">outputs</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">fromlabelsModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">multiRowDf</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">sorted</span><span class="p">(</span><span class="nb">set</span><span class="p">([(</span><span class="n">i</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">i</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">result</span><span class="o">.</span><span class="n">index1</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">result</span><span class="o">.</span><span class="n">index2</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()]),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="go">[(0, 0.0, 0.0), (1, 1.0, 1.0), (2, 2.0, 0.0), (3, 0.0, 1.0), (4, 0.0, 1.0), (5, 2.0, 1.0)]</span>
</pre></div>
</div>
<p class="rubric">Methods</p>
<table class="longtable table autosummary">
<colgroup>
<col style="width: 10%" />
<col style="width: 90%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.clear" title="pyspark.ml.feature.StringIndexer.clear"><code class="xref py py-obj docutils literal notranslate"><span class="pre">clear</span></code></a>(param)</p></td>
<td><p>Clears a param from the param map if it has been explicitly set.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.copy" title="pyspark.ml.feature.StringIndexer.copy"><code class="xref py py-obj docutils literal notranslate"><span class="pre">copy</span></code></a>([extra])</p></td>
<td><p>Creates a copy of this instance with the same uid and some extra params.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.explainParam" title="pyspark.ml.feature.StringIndexer.explainParam"><code class="xref py py-obj docutils literal notranslate"><span class="pre">explainParam</span></code></a>(param)</p></td>
<td><p>Explains a single param and returns its name, doc, and optional default value and user-supplied value in a string.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.explainParams" title="pyspark.ml.feature.StringIndexer.explainParams"><code class="xref py py-obj docutils literal notranslate"><span class="pre">explainParams</span></code></a>()</p></td>
<td><p>Returns the documentation of all params with their optional default values and user-supplied values.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.extractParamMap" title="pyspark.ml.feature.StringIndexer.extractParamMap"><code class="xref py py-obj docutils literal notranslate"><span class="pre">extractParamMap</span></code></a>([extra])</p></td>
<td><p>Extracts the embedded default param values and user-supplied values, and then merges them with extra values from input into a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values &lt; user-supplied values &lt; extra.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.fit" title="pyspark.ml.feature.StringIndexer.fit"><code class="xref py py-obj docutils literal notranslate"><span class="pre">fit</span></code></a>(dataset[, params])</p></td>
<td><p>Fits a model to the input dataset with optional parameters.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.fitMultiple" title="pyspark.ml.feature.StringIndexer.fitMultiple"><code class="xref py py-obj docutils literal notranslate"><span class="pre">fitMultiple</span></code></a>(dataset, paramMaps)</p></td>
<td><p>Fits a model to the input dataset for each param map in <cite>paramMaps</cite>.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getHandleInvalid" title="pyspark.ml.feature.StringIndexer.getHandleInvalid"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getHandleInvalid</span></code></a>()</p></td>
<td><p>Gets the value of handleInvalid or its default value.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getInputCol" title="pyspark.ml.feature.StringIndexer.getInputCol"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getInputCol</span></code></a>()</p></td>
<td><p>Gets the value of inputCol or its default value.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getInputCols" title="pyspark.ml.feature.StringIndexer.getInputCols"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getInputCols</span></code></a>()</p></td>
<td><p>Gets the value of inputCols or its default value.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getOrDefault" title="pyspark.ml.feature.StringIndexer.getOrDefault"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getOrDefault</span></code></a>(param)</p></td>
<td><p>Gets the value of a param in the user-supplied param map or its default value.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getOutputCol" title="pyspark.ml.feature.StringIndexer.getOutputCol"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getOutputCol</span></code></a>()</p></td>
<td><p>Gets the value of outputCol or its default value.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getOutputCols" title="pyspark.ml.feature.StringIndexer.getOutputCols"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getOutputCols</span></code></a>()</p></td>
<td><p>Gets the value of outputCols or its default value.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getParam" title="pyspark.ml.feature.StringIndexer.getParam"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getParam</span></code></a>(paramName)</p></td>
<td><p>Gets a param by its name.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.getStringOrderType" title="pyspark.ml.feature.StringIndexer.getStringOrderType"><code class="xref py py-obj docutils literal notranslate"><span class="pre">getStringOrderType</span></code></a>()</p></td>
<td><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.stringOrderType" title="pyspark.ml.feature.StringIndexer.stringOrderType"><code class="xref py py-attr docutils literal notranslate"><span class="pre">stringOrderType</span></code></a> or its default value ‘frequencyDesc’.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.hasDefault" title="pyspark.ml.feature.StringIndexer.hasDefault"><code class="xref py py-obj docutils literal notranslate"><span class="pre">hasDefault</span></code></a>(param)</p></td>
<td><p>Checks whether a param has a default value.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.hasParam" title="pyspark.ml.feature.StringIndexer.hasParam"><code class="xref py py-obj docutils literal notranslate"><span class="pre">hasParam</span></code></a>(paramName)</p></td>
<td><p>Tests whether this instance contains a param with a given (string) name.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.isDefined" title="pyspark.ml.feature.StringIndexer.isDefined"><code class="xref py py-obj docutils literal notranslate"><span class="pre">isDefined</span></code></a>(param)</p></td>
<td><p>Checks whether a param is explicitly set by user or has a default value.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.isSet" title="pyspark.ml.feature.StringIndexer.isSet"><code class="xref py py-obj docutils literal notranslate"><span class="pre">isSet</span></code></a>(param)</p></td>
<td><p>Checks whether a param is explicitly set by user.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.load" title="pyspark.ml.feature.StringIndexer.load"><code class="xref py py-obj docutils literal notranslate"><span class="pre">load</span></code></a>(path)</p></td>
<td><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.read" title="pyspark.ml.feature.StringIndexer.read"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read</span></code></a>()</p></td>
<td><p>Returns an MLReader instance for this class.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.save" title="pyspark.ml.feature.StringIndexer.save"><code class="xref py py-obj docutils literal notranslate"><span class="pre">save</span></code></a>(path)</p></td>
<td><p>Save this ML instance to the given path, a shortcut of ‘write().save(path)’.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.set" title="pyspark.ml.feature.StringIndexer.set"><code class="xref py py-obj docutils literal notranslate"><span class="pre">set</span></code></a>(param, value)</p></td>
<td><p>Sets a parameter in the embedded param map.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.setHandleInvalid" title="pyspark.ml.feature.StringIndexer.setHandleInvalid"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setHandleInvalid</span></code></a>(value)</p></td>
<td><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.handleInvalid" title="pyspark.ml.feature.StringIndexer.handleInvalid"><code class="xref py py-attr docutils literal notranslate"><span class="pre">handleInvalid</span></code></a>.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.setInputCol" title="pyspark.ml.feature.StringIndexer.setInputCol"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setInputCol</span></code></a>(value)</p></td>
<td><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.inputCol" title="pyspark.ml.feature.StringIndexer.inputCol"><code class="xref py py-attr docutils literal notranslate"><span class="pre">inputCol</span></code></a>.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.setInputCols" title="pyspark.ml.feature.StringIndexer.setInputCols"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setInputCols</span></code></a>(value)</p></td>
<td><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.inputCols" title="pyspark.ml.feature.StringIndexer.inputCols"><code class="xref py py-attr docutils literal notranslate"><span class="pre">inputCols</span></code></a>.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.setOutputCol" title="pyspark.ml.feature.StringIndexer.setOutputCol"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setOutputCol</span></code></a>(value)</p></td>
<td><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.outputCol" title="pyspark.ml.feature.StringIndexer.outputCol"><code class="xref py py-attr docutils literal notranslate"><span class="pre">outputCol</span></code></a>.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.setOutputCols" title="pyspark.ml.feature.StringIndexer.setOutputCols"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setOutputCols</span></code></a>(value)</p></td>
<td><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.outputCols" title="pyspark.ml.feature.StringIndexer.outputCols"><code class="xref py py-attr docutils literal notranslate"><span class="pre">outputCols</span></code></a>.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.setParams" title="pyspark.ml.feature.StringIndexer.setParams"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setParams</span></code></a>(*[, inputCol, outputCol, …])</p></td>
<td><p>Sets params for this StringIndexer.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.setStringOrderType" title="pyspark.ml.feature.StringIndexer.setStringOrderType"><code class="xref py py-obj docutils literal notranslate"><span class="pre">setStringOrderType</span></code></a>(value)</p></td>
<td><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.stringOrderType" title="pyspark.ml.feature.StringIndexer.stringOrderType"><code class="xref py py-attr docutils literal notranslate"><span class="pre">stringOrderType</span></code></a>.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.write" title="pyspark.ml.feature.StringIndexer.write"><code class="xref py py-obj docutils literal notranslate"><span class="pre">write</span></code></a>()</p></td>
<td><p>Returns an MLWriter instance for this ML instance.</p></td>
</tr>
</tbody>
</table>
<p class="rubric">Attributes</p>
<table class="longtable table autosummary">
<colgroup>
<col style="width: 10%" />
<col style="width: 90%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.handleInvalid" title="pyspark.ml.feature.StringIndexer.handleInvalid"><code class="xref py py-obj docutils literal notranslate"><span class="pre">handleInvalid</span></code></a></p></td>
<td><p></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.inputCol" title="pyspark.ml.feature.StringIndexer.inputCol"><code class="xref py py-obj docutils literal notranslate"><span class="pre">inputCol</span></code></a></p></td>
<td><p></p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.inputCols" title="pyspark.ml.feature.StringIndexer.inputCols"><code class="xref py py-obj docutils literal notranslate"><span class="pre">inputCols</span></code></a></p></td>
<td><p></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.outputCol" title="pyspark.ml.feature.StringIndexer.outputCol"><code class="xref py py-obj docutils literal notranslate"><span class="pre">outputCol</span></code></a></p></td>
<td><p></p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.outputCols" title="pyspark.ml.feature.StringIndexer.outputCols"><code class="xref py py-obj docutils literal notranslate"><span class="pre">outputCols</span></code></a></p></td>
<td><p></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.params" title="pyspark.ml.feature.StringIndexer.params"><code class="xref py py-obj docutils literal notranslate"><span class="pre">params</span></code></a></p></td>
<td><p>Returns all params ordered by name.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyspark.ml.feature.StringIndexer.stringOrderType" title="pyspark.ml.feature.StringIndexer.stringOrderType"><code class="xref py py-obj docutils literal notranslate"><span class="pre">stringOrderType</span></code></a></p></td>
<td><p></p></td>
</tr>
</tbody>
</table>
<p class="rubric">Methods Documentation</p>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.clear">
<code class="sig-name descname">clear</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">param</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.clear" title="Permalink to this definition"></a></dt>
<dd><p>Clears a param from the param map if it has been explicitly set.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.copy">
<code class="sig-name descname">copy</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">extra</span><span class="o">=</span><span class="default_value">None</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.copy" title="Permalink to this definition"></a></dt>
<dd><p>Creates a copy of this instance with the same uid and some
extra params. This implementation first calls Params.copy and
then makes a copy of the companion Java pipeline component with
extra params. So both the Python wrapper and the Java pipeline
component get copied.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl class="simple">
<dt><strong>extra</strong><span class="classifier">dict, optional</span></dt><dd><p>Extra parameters to copy to the new instance</p>
</dd>
</dl>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><dl class="simple">
<dt><code class="xref py py-class docutils literal notranslate"><span class="pre">JavaParams</span></code></dt><dd><p>Copy of this instance</p>
</dd>
</dl>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.explainParam">
<code class="sig-name descname">explainParam</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">param</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.explainParam" title="Permalink to this definition"></a></dt>
<dd><p>Explains a single param and returns its name, doc, and optional
default value and user-supplied value in a string.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.explainParams">
<code class="sig-name descname">explainParams</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.explainParams" title="Permalink to this definition"></a></dt>
<dd><p>Returns the documentation of all params with their optional
default values and user-supplied values.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.extractParamMap">
<code class="sig-name descname">extractParamMap</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">extra</span><span class="o">=</span><span class="default_value">None</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.extractParamMap" title="Permalink to this definition"></a></dt>
<dd><p>Extracts the embedded default param values and user-supplied
values, and then merges them with extra values from input into
a flat param map, where the latter value is used if there exist
conflicts, i.e., with ordering: default param values &lt;
user-supplied values &lt; extra.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl class="simple">
<dt><strong>extra</strong><span class="classifier">dict, optional</span></dt><dd><p>extra param values</p>
</dd>
</dl>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><dl class="simple">
<dt>dict</dt><dd><p>merged param map</p>
</dd>
</dl>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.fit">
<code class="sig-name descname">fit</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">dataset</span></em>, <em class="sig-param"><span class="n">params</span><span class="o">=</span><span class="default_value">None</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.fit" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset with optional parameters.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.3.0.</span></p>
</div>
<dl class="field-list">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl>
<dt><strong>dataset</strong><span class="classifier"><a class="reference internal" href="pyspark.sql.DataFrame.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.DataFrame</span></code></a></span></dt><dd><p>input dataset.</p>
</dd>
<dt><strong>params</strong><span class="classifier">dict or list or tuple, optional</span></dt><dd><p>an optional param map that overrides embedded params. If a list/tuple of
param maps is given, this calls fit on each param map and returns a list of
models.</p>
</dd>
</dl>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><dl class="simple">
<dt><code class="xref py py-class docutils literal notranslate"><span class="pre">Transformer</span></code> or a list of <code class="xref py py-class docutils literal notranslate"><span class="pre">Transformer</span></code></dt><dd><p>fitted model(s)</p>
</dd>
</dl>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.fitMultiple">
<code class="sig-name descname">fitMultiple</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">dataset</span></em>, <em class="sig-param"><span class="n">paramMaps</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.fitMultiple" title="Permalink to this definition"></a></dt>
<dd><p>Fits a model to the input dataset for each param map in <cite>paramMaps</cite>.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 2.3.0.</span></p>
</div>
<dl class="field-list">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl>
<dt><strong>dataset</strong><span class="classifier"><a class="reference internal" href="pyspark.sql.DataFrame.html#pyspark.sql.DataFrame" title="pyspark.sql.DataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.DataFrame</span></code></a></span></dt><dd><p>input dataset.</p>
</dd>
<dt><strong>paramMaps</strong><span class="classifier"><code class="xref py py-class docutils literal notranslate"><span class="pre">collections.abc.Sequence</span></code></span></dt><dd><p>A Sequence of param maps.</p>
</dd>
</dl>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><dl class="simple">
<dt><code class="xref py py-class docutils literal notranslate"><span class="pre">_FitMultipleIterator</span></code></dt><dd><p>A thread-safe iterable which contains one model for each param map. Each
call to <cite>next(modelIterator)</cite> will return <cite>(index, model)</cite> where model was fit
using <cite>paramMaps[index]</cite>. <cite>index</cite> values may not be sequential.</p>
</dd>
</dl>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getHandleInvalid">
<code class="sig-name descname">getHandleInvalid</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of handleInvalid or its default value.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getInputCol">
<code class="sig-name descname">getInputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCol or its default value.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getInputCols">
<code class="sig-name descname">getInputCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getInputCols" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of inputCols or its default value.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getOrDefault">
<code class="sig-name descname">getOrDefault</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">param</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getOrDefault" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getOutputCol">
<code class="sig-name descname">getOutputCol</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCol or its default value.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getOutputCols">
<code class="sig-name descname">getOutputCols</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getOutputCols" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of outputCols or its default value.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getParam">
<code class="sig-name descname">getParam</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">paramName</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getParam" title="Permalink to this definition"></a></dt>
<dd><p>Gets a param by its name.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.getStringOrderType">
<code class="sig-name descname">getStringOrderType</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.getStringOrderType" title="Permalink to this definition"></a></dt>
<dd><p>Gets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.stringOrderType" title="pyspark.ml.feature.StringIndexer.stringOrderType"><code class="xref py py-attr docutils literal notranslate"><span class="pre">stringOrderType</span></code></a> or its default value ‘frequencyDesc’.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 2.3.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.hasDefault">
<code class="sig-name descname">hasDefault</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">param</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.hasDefault" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param has a default value.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.hasParam">
<code class="sig-name descname">hasParam</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">paramName</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.hasParam" title="Permalink to this definition"></a></dt>
<dd><p>Tests whether this instance contains a param with a given
(string) name.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.isDefined">
<code class="sig-name descname">isDefined</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">param</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.isDefined" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user or has
a default value.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.isSet">
<code class="sig-name descname">isSet</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">param</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.isSet" title="Permalink to this definition"></a></dt>
<dd><p>Checks whether a param is explicitly set by user.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.load">
<em class="property">classmethod </em><code class="sig-name descname">load</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">path</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.load" title="Permalink to this definition"></a></dt>
<dd><p>Reads an ML instance from the input path, a shortcut of <cite>read().load(path)</cite>.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.read">
<em class="property">classmethod </em><code class="sig-name descname">read</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.read" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLReader instance for this class.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.save">
<code class="sig-name descname">save</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">path</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.save" title="Permalink to this definition"></a></dt>
<dd><p>Save this ML instance to the given path, a shortcut of ‘write().save(path)’.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.set">
<code class="sig-name descname">set</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">param</span></em>, <em class="sig-param"><span class="n">value</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.set" title="Permalink to this definition"></a></dt>
<dd><p>Sets a parameter in the embedded param map.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.setHandleInvalid">
<code class="sig-name descname">setHandleInvalid</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">value</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer.setHandleInvalid"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setHandleInvalid" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.handleInvalid" title="pyspark.ml.feature.StringIndexer.handleInvalid"><code class="xref py py-attr docutils literal notranslate"><span class="pre">handleInvalid</span></code></a>.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.setInputCol">
<code class="sig-name descname">setInputCol</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">value</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer.setInputCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setInputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.inputCol" title="pyspark.ml.feature.StringIndexer.inputCol"><code class="xref py py-attr docutils literal notranslate"><span class="pre">inputCol</span></code></a>.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.setInputCols">
<code class="sig-name descname">setInputCols</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">value</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer.setInputCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setInputCols" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.inputCols" title="pyspark.ml.feature.StringIndexer.inputCols"><code class="xref py py-attr docutils literal notranslate"><span class="pre">inputCols</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 3.0.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.setOutputCol">
<code class="sig-name descname">setOutputCol</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">value</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer.setOutputCol"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setOutputCol" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.outputCol" title="pyspark.ml.feature.StringIndexer.outputCol"><code class="xref py py-attr docutils literal notranslate"><span class="pre">outputCol</span></code></a>.</p>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.setOutputCols">
<code class="sig-name descname">setOutputCols</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">value</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer.setOutputCols"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setOutputCols" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.outputCols" title="pyspark.ml.feature.StringIndexer.outputCols"><code class="xref py py-attr docutils literal notranslate"><span class="pre">outputCols</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 3.0.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.setParams">
<code class="sig-name descname">setParams</code><span class="sig-paren">(</span><em class="sig-param">*</em>, <em class="sig-param">inputCol=None</em>, <em class="sig-param">outputCol=None</em>, <em class="sig-param">inputCols=None</em>, <em class="sig-param">outputCols=None</em>, <em class="sig-param">handleInvalid=&quot;error&quot;</em>, <em class="sig-param">stringOrderType=&quot;frequencyDesc&quot;</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer.setParams"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setParams" title="Permalink to this definition"></a></dt>
<dd><p>Sets params for this StringIndexer.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 1.4.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.setStringOrderType">
<code class="sig-name descname">setStringOrderType</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">value</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/ml/feature.html#StringIndexer.setStringOrderType"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.setStringOrderType" title="Permalink to this definition"></a></dt>
<dd><p>Sets the value of <a class="reference internal" href="#pyspark.ml.feature.StringIndexer.stringOrderType" title="pyspark.ml.feature.StringIndexer.stringOrderType"><code class="xref py py-attr docutils literal notranslate"><span class="pre">stringOrderType</span></code></a>.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 2.3.0.</span></p>
</div>
</dd></dl>
<dl class="py method">
<dt id="pyspark.ml.feature.StringIndexer.write">
<code class="sig-name descname">write</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.write" title="Permalink to this definition"></a></dt>
<dd><p>Returns an MLWriter instance for this ML instance.</p>
</dd></dl>
<p class="rubric">Attributes Documentation</p>
<dl class="py attribute">
<dt id="pyspark.ml.feature.StringIndexer.handleInvalid">
<code class="sig-name descname">handleInvalid</code><em class="property"> = Param(parent='undefined', name='handleInvalid', doc=&quot;how to handle invalid data (unseen or NULL values) in features and label column of string type. Options are 'skip' (filter out rows with invalid data), error (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels).&quot;)</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.handleInvalid" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt id="pyspark.ml.feature.StringIndexer.inputCol">
<code class="sig-name descname">inputCol</code><em class="property"> = Param(parent='undefined', name='inputCol', doc='input column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.inputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt id="pyspark.ml.feature.StringIndexer.inputCols">
<code class="sig-name descname">inputCols</code><em class="property"> = Param(parent='undefined', name='inputCols', doc='input column names.')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.inputCols" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt id="pyspark.ml.feature.StringIndexer.outputCol">
<code class="sig-name descname">outputCol</code><em class="property"> = Param(parent='undefined', name='outputCol', doc='output column name.')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.outputCol" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt id="pyspark.ml.feature.StringIndexer.outputCols">
<code class="sig-name descname">outputCols</code><em class="property"> = Param(parent='undefined', name='outputCols', doc='output column names.')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.outputCols" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt id="pyspark.ml.feature.StringIndexer.params">
<code class="sig-name descname">params</code><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns all params ordered by name. The default implementation
uses <code class="xref py py-func docutils literal notranslate"><span class="pre">dir()</span></code> to get all attributes of type
<code class="xref py py-class docutils literal notranslate"><span class="pre">Param</span></code>.</p>
</dd></dl>
<dl class="py attribute">
<dt id="pyspark.ml.feature.StringIndexer.stringOrderType">
<code class="sig-name descname">stringOrderType</code><em class="property"> = Param(parent='undefined', name='stringOrderType', doc='How to order labels of string column. The first label after ordering is assigned an index of 0. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. Default is frequencyDesc. In case of equal frequency when under frequencyDesc/Asc, the strings are further sorted alphabetically')</em><a class="headerlink" href="#pyspark.ml.feature.StringIndexer.stringOrderType" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
</div>
</div>
<div class='prev-next-bottom'>
<a class='left-prev' id="prev-link" href="pyspark.ml.feature.StopWordsRemover.html" title="previous page">StopWordsRemover</a>
<a class='right-next' id="next-link" href="pyspark.ml.feature.StringIndexerModel.html" title="next page">StringIndexerModel</a>
</div>
</main>
</div>
</div>
<script src="../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
&copy; Copyright .<br/>
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>