<!-- blob: 9353c157ac051c87187538f61df4847fd950f733 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>pyspark.ml.pipeline &#8212; PySpark master documentation</title>
<link href="../../../_static/styles/theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link rel="stylesheet"
href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet" href="../../../_static/styles/pydata-sphinx-theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf">
<script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/language_data.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "tex2jax_ignore|mathjax_ignore|document", "processClass": "tex2jax_process|mathjax_process|math|output_area"}})</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/pipeline.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Google Analytics -->
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<div class="container-fluid" id="banner"></div>
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"><div class="container-xl">
<div id="navbar-start">
<a class="navbar-brand" href="../../../index.html">
<img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home">
</a>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-collapsible" aria-controls="navbar-collapsible" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-collapsible" class="col-lg-9 collapse navbar-collapse">
<div id="navbar-center" class="mr-auto">
<div class="navbar-center-item">
<ul id="navbar-main-elements" class="navbar-nav">
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../index.html">
Overview
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../development/index.html">
Development
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</div>
</div>
<div id="navbar-end">
<div class="navbar-end-item">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
3.5.1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Build the docs homepage URL for one entry of the versions JSON.
// `entry.version` is substituted for the first "{version}" placeholder
// of the URL pattern; the resulting string is returned.
function buildURL(entry) {
  // URL pattern supplied by jinja at page-generation time.
  const urlPattern = "https://spark.apache.org/docs/{version}/api/python/index.html";
  return urlPattern.replace("{version}", entry.version);
}
// Click handler for a version-switcher link.
// Checks (with a HEAD request) whether the page currently being viewed also
// exists in the docs version the clicked link points to. If it does, navigate
// there; otherwise fall back to the homepage of that docs version.
// Always returns false so the browser does not follow the link itself while
// the asynchronous check is still in flight.
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/ml/pipeline.html";
  const otherDocsHomepage = event.target.getAttribute("href");
  // The homepage href may end in "index.html". Appending the page path to it
  // directly would produce ".../index.html_modules/...", which never exists,
  // so drop a trailing "index.html" to get the docs root directory first.
  // An href that already ends in "/" is left unchanged.
  const otherDocsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  const tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // page is missing in the other version (or the probe failed): go to its homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Populate the version switcher dropdown (runs immediately on page load).
// Fetches the versions JSON, then appends one link per entry to the
// #version_switcher element, in the order the entries appear in the JSON.
(function populateVersionSwitcher() {
  const versionsJsonUrl = "https://spark.apache.org/static/versions.json";
  const linkClasses = "list-group-item list-group-item-action py-1";

  $.getJSON(versionsJsonUrl, function (versions, textStatus, jqXHR) {
    const dropdown = $("#version_switcher");
    // Links are created up front, before any per-link AJAX check, so the
    // dropdown order is correct; each link initially targets the homepage
    // of its docs version.
    $.each(versions, function (position, item) {
      // entries without a custom name (e.g., "latest") are labelled by version
      if (!("name" in item)) {
        item.name = item.version;
      }
      item.url = buildURL(item);

      const link = document.createElement("a");
      link.setAttribute("class", linkClasses);
      link.setAttribute("href", String(item.url));
      link.textContent = String(item.name);
      // on click, try the matching page in that version before the homepage
      link.onclick = checkPageExistsAndRedirect;
      dropdown.append(link);
    });
  });
})();
</script>
</div>
</div>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<!-- Only show if we have sidebars configured, else just a small margin -->
<div class="col-12 col-md-3 bd-sidebar">
<div class="sidebar-start-items"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
</div>
</nav>
</div>
<div class="sidebar-end-items">
</div>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<h1>Source code for pyspark.ml.pipeline</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Type</span><span class="p">,</span> <span class="n">Union</span><span class="p">,</span> <span class="n">cast</span><span class="p">,</span> <span class="n">TYPE_CHECKING</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">keyword_only</span><span class="p">,</span> <span class="n">since</span><span class="p">,</span> <span class="n">SparkContext</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.base</span> <span class="kn">import</span> <span class="n">Estimator</span><span class="p">,</span> <span class="n">Model</span><span class="p">,</span> <span class="n">Transformer</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.param</span> <span class="kn">import</span> <span class="n">Param</span><span class="p">,</span> <span class="n">Params</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.util</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">MLReadable</span><span class="p">,</span>
<span class="n">MLWritable</span><span class="p">,</span>
<span class="n">JavaMLWriter</span><span class="p">,</span>
<span class="n">JavaMLReader</span><span class="p">,</span>
<span class="n">DefaultParamsReader</span><span class="p">,</span>
<span class="n">DefaultParamsWriter</span><span class="p">,</span>
<span class="n">MLWriter</span><span class="p">,</span>
<span class="n">MLReader</span><span class="p">,</span>
<span class="n">JavaMLReadable</span><span class="p">,</span>
<span class="n">JavaMLWritable</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.wrapper</span> <span class="kn">import</span> <span class="n">JavaParams</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.common</span> <span class="kn">import</span> <span class="n">inherit_doc</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.ml._typing</span> <span class="kn">import</span> <span class="n">ParamMap</span><span class="p">,</span> <span class="n">PipelineStage</span>
<span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span>
<div class="viewcode-block" id="Pipeline"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.Pipeline.html#pyspark.ml.Pipeline">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">Pipeline</span><span class="p">(</span><span class="n">Estimator</span><span class="p">[</span><span class="s2">&quot;PipelineModel&quot;</span><span class="p">],</span> <span class="n">MLReadable</span><span class="p">[</span><span class="s2">&quot;Pipeline&quot;</span><span class="p">],</span> <span class="n">MLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A simple pipeline, which acts as an estimator. A Pipeline consists</span>
<span class="sd"> of a sequence of stages, each of which is either an</span>
<span class="sd"> :py:class:`Estimator` or a :py:class:`Transformer`. When</span>
<span class="sd"> :py:meth:`Pipeline.fit` is called, the stages are executed in</span>
<span class="sd"> order. If a stage is an :py:class:`Estimator`, its</span>
<span class="sd"> :py:meth:`Estimator.fit` method will be called on the input</span>
<span class="sd"> dataset to fit a model. Then the model, which is a transformer,</span>
<span class="sd"> will be used to transform the dataset as the input to the next</span>
<span class="sd"> stage. If a stage is a :py:class:`Transformer`, its</span>
<span class="sd"> :py:meth:`Transformer.transform` method will be called to produce</span>
<span class="sd"> the dataset for the next stage. The fitted model from a</span>
<span class="sd"> :py:class:`Pipeline` is a :py:class:`PipelineModel`, which</span>
<span class="sd"> consists of fitted models and transformers, corresponding to the</span>
<span class="sd"> pipeline stages. If stages is an empty list, the pipeline acts as an</span>
<span class="sd"> identity transformer.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stages</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span>
<span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">&quot;stages&quot;</span><span class="p">,</span> <span class="s2">&quot;a list of pipeline stages&quot;</span>
<span class="p">)</span>
<span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">stages</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __init__(self, \\*, stages=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Pipeline</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<div class="viewcode-block" id="Pipeline.setStages"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.Pipeline.html#pyspark.ml.Pipeline.setStages">[docs]</a> <span class="k">def</span> <span class="nf">setStages</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;Pipeline&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Set pipeline stages.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> value : list</span>
<span class="sd"> of :py:class:`pyspark.ml.Transformer`</span>
<span class="sd"> or :py:class:`pyspark.ml.Estimator`</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :py:class:`Pipeline`</span>
<span class="sd"> the pipeline instance</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stages</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="Pipeline.getStages"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.Pipeline.html#pyspark.ml.Pipeline.getStages">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getStages</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Get pipeline stages.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stages</span><span class="p">)</span></div>
<div class="viewcode-block" id="Pipeline.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.Pipeline.html#pyspark.ml.Pipeline.setParams">[docs]</a> <span class="nd">@keyword_only</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;1.3.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">stages</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Pipeline&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> setParams(self, \\*, stages=None)</span>
<span class="sd"> Sets params for Pipeline.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_fit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PipelineModel&quot;</span><span class="p">:</span>
<span class="n">stages</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getStages</span><span class="p">()</span>
<span class="k">for</span> <span class="n">stage</span> <span class="ow">in</span> <span class="n">stages</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">stage</span><span class="p">,</span> <span class="n">Estimator</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">stage</span><span class="p">,</span> <span class="n">Transformer</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Cannot recognize a pipeline stage of type </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">stage</span><span class="p">))</span>
<span class="n">indexOfLastEstimator</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">stage</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">stages</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">stage</span><span class="p">,</span> <span class="n">Estimator</span><span class="p">):</span>
<span class="n">indexOfLastEstimator</span> <span class="o">=</span> <span class="n">i</span>
<span class="n">transformers</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Transformer</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">stage</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">stages</span><span class="p">):</span>
<span class="k">if</span> <span class="n">i</span> <span class="o">&lt;=</span> <span class="n">indexOfLastEstimator</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">stage</span><span class="p">,</span> <span class="n">Transformer</span><span class="p">):</span>
<span class="n">transformers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">stage</span><span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">stage</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span> <span class="c1"># must be an Estimator</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">stage</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="n">transformers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="k">if</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">indexOfLastEstimator</span><span class="p">:</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">transformers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Transformer</span><span class="p">,</span> <span class="n">stage</span><span class="p">))</span>
<span class="k">return</span> <span class="n">PipelineModel</span><span class="p">(</span><span class="n">transformers</span><span class="p">)</span>
<div class="viewcode-block" id="Pipeline.copy"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.Pipeline.html#pyspark.ml.Pipeline.copy">[docs]</a> <span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">extra</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ParamMap&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Pipeline&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a copy of this instance.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> extra : dict, optional</span>
<span class="sd"> extra parameters</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :py:class:`Pipeline`</span>
<span class="sd"> new instance</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">extra</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">extra</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">()</span>
<span class="n">that</span> <span class="o">=</span> <span class="n">Params</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">extra</span><span class="p">)</span>
<span class="n">stages</span> <span class="o">=</span> <span class="p">[</span><span class="n">stage</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">extra</span><span class="p">)</span> <span class="k">for</span> <span class="n">stage</span> <span class="ow">in</span> <span class="n">that</span><span class="o">.</span><span class="n">getStages</span><span class="p">()]</span>
<span class="k">return</span> <span class="n">that</span><span class="o">.</span><span class="n">setStages</span><span class="p">(</span><span class="n">stages</span><span class="p">)</span></div>
<div class="viewcode-block" id="Pipeline.write"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.Pipeline.html#pyspark.ml.Pipeline.write">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">MLWriter</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns an MLWriter instance for this ML instance.&quot;&quot;&quot;</span>
<span class="n">allStagesAreJava</span> <span class="o">=</span> <span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">checkStagesForJava</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">getStages</span><span class="p">())</span>
<span class="k">if</span> <span class="n">allStagesAreJava</span><span class="p">:</span>
<span class="k">return</span> <span class="n">JavaMLWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="k">return</span> <span class="n">PipelineWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span></div>
<div class="viewcode-block" id="Pipeline.read"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.Pipeline.html#pyspark.ml.Pipeline.read">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PipelineReader&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns an MLReader instance for this class.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">PipelineReader</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span></div>
<span class="nd">@classmethod</span>
<span class="k">def</span> <span class="nf">_from_java</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">java_stage</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Pipeline&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Given a Java Pipeline, create and return a Python wrapper of it.</span>
<span class="sd"> Used for ML persistence.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Create a new instance of this stage.</span>
<span class="n">py_stage</span> <span class="o">=</span> <span class="bp">cls</span><span class="p">()</span>
<span class="c1"># Load information from java_stage to the instance.</span>
<span class="n">py_stages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">JavaParams</span><span class="o">.</span><span class="n">_from_java</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">java_stage</span><span class="o">.</span><span class="n">getStages</span><span class="p">()</span>
<span class="p">]</span>
<span class="n">py_stage</span><span class="o">.</span><span class="n">setStages</span><span class="p">(</span><span class="n">py_stages</span><span class="p">)</span>
<span class="n">py_stage</span><span class="o">.</span><span class="n">_resetUid</span><span class="p">(</span><span class="n">java_stage</span><span class="o">.</span><span class="n">uid</span><span class="p">())</span>
<span class="k">return</span> <span class="n">py_stage</span>
<!-- Pipeline._to_java: allocates a JVM array typed as org.apache.spark.ml.PipelineStage, fills it with the _to_java() result of every stage, and sets it on a new org.apache.spark.ml.Pipeline created with this instance's uid. --><span class="k">def</span> <span class="nf">_to_java</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Transfer this instance to a Java Pipeline. Used for ML persistence.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> py4j.java_gateway.JavaObject</span>
<span class="sd"> Java object equivalent to this instance.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">gateway</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_gateway</span>
<!-- Fails fast when no gateway or JVM is attached to SparkContext. --><span class="k">assert</span> <span class="n">gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<!-- Here cls is a local name for the JVM-side PipelineStage class (the array element type), not a Python class. --><span class="bp">cls</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">PipelineStage</span>
<span class="n">java_stages</span> <span class="o">=</span> <span class="n">gateway</span><span class="o">.</span><span class="n">new_array</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">getStages</span><span class="p">()))</span>
<span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">stage</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">getStages</span><span class="p">()):</span>
<span class="n">java_stages</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JavaParams</span><span class="p">,</span> <span class="n">stage</span><span class="p">)</span><span class="o">.</span><span class="n">_to_java</span><span class="p">()</span>
<span class="n">_java_obj</span> <span class="o">=</span> <span class="n">JavaParams</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">&quot;org.apache.spark.ml.Pipeline&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">_java_obj</span><span class="o">.</span><span class="n">setStages</span><span class="p">(</span><span class="n">java_stages</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_java_obj</span></div>
<!-- PipelineWriter: MLWriter subclass that keeps the Pipeline passed to __init__ and, in saveImpl, validates its stages and delegates the write to PipelineSharedReadWrite.saveImpl together with self.sc and the target path. --><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PipelineWriter</span><span class="p">(</span><span class="n">MLWriter</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> (Private) Specialization of :py:class:`MLWriter` for :py:class:`Pipeline` types</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">instance</span><span class="p">:</span> <span class="n">Pipeline</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PipelineWriter</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">instance</span> <span class="o">=</span> <span class="n">instance</span>
<span class="k">def</span> <span class="nf">saveImpl</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">stages</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">instance</span><span class="o">.</span><span class="n">getStages</span><span class="p">()</span>
<!-- Validation runs before anything is written, so a non-writable stage raises before saveImpl is called. --><span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">validateStages</span><span class="p">(</span><span class="n">stages</span><span class="p">)</span>
<span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">saveImpl</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">instance</span><span class="p">,</span> <span class="n">stages</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<!-- PipelineReader: load() reads the saved metadata first; unless paramMap["language"] is present and equal to "Python" it defers to JavaMLReader, otherwise it rebuilds the Pipeline from PipelineSharedReadWrite.load and restores the saved uid. --><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PipelineReader</span><span class="p">(</span><span class="n">MLReader</span><span class="p">[</span><span class="n">Pipeline</span><span class="p">]):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> (Private) Specialization of :py:class:`MLReader` for :py:class:`Pipeline` types</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="bp">cls</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="n">Pipeline</span><span class="p">]):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PipelineReader</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cls</span> <span class="o">=</span> <span class="bp">cls</span>
<span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Pipeline</span><span class="p">:</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="n">DefaultParamsReader</span><span class="o">.</span><span class="n">loadMetadata</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sc</span><span class="p">)</span>
<!-- The "language" entry is the marker written by PipelineSharedReadWrite.saveImpl; metadata without it takes the JavaMLReader branch. --><span class="k">if</span> <span class="s2">&quot;language&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">[</span><span class="s2">&quot;paramMap&quot;</span><span class="p">]</span> <span class="ow">or</span> <span class="n">metadata</span><span class="p">[</span><span class="s2">&quot;paramMap&quot;</span><span class="p">][</span><span class="s2">&quot;language&quot;</span><span class="p">]</span> <span class="o">!=</span> <span class="s2">&quot;Python&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">JavaMLReader</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Type</span><span class="p">[</span><span class="s2">&quot;JavaMLReadable[Pipeline]&quot;</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">cls</span><span class="p">))</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">uid</span><span class="p">,</span> <span class="n">stages</span> <span class="o">=</span> <span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">metadata</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Pipeline</span><span class="p">(</span><span class="n">stages</span><span class="o">=</span><span class="n">stages</span><span class="p">)</span><span class="o">.</span><span class="n">_resetUid</span><span class="p">(</span><span class="n">uid</span><span class="p">)</span>
<!-- PipelineModelWriter: MLWriter subclass that keeps the PipelineModel passed to __init__; saveImpl reads the stages attribute directly, casts the list to List["PipelineStage"], validates it and delegates to PipelineSharedReadWrite.saveImpl. --><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PipelineModelWriter</span><span class="p">(</span><span class="n">MLWriter</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> (Private) Specialization of :py:class:`MLWriter` for :py:class:`PipelineModel` types</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">instance</span><span class="p">:</span> <span class="s2">&quot;PipelineModel&quot;</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PipelineModelWriter</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">instance</span> <span class="o">=</span> <span class="n">instance</span>
<span class="k">def</span> <span class="nf">saveImpl</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">stages</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">instance</span><span class="o">.</span><span class="n">stages</span>
<span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">validateStages</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">],</span> <span class="n">stages</span><span class="p">))</span>
<span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">saveImpl</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">instance</span><span class="p">,</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">],</span> <span class="n">stages</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span>
<span class="p">)</span>
<!-- PipelineModelReader: same two-branch load as PipelineReader; metadata not marked "language": "Python" goes to JavaMLReader, otherwise the stages come from PipelineSharedReadWrite.load, are cast to List[Transformer] and wrapped in a PipelineModel whose uid is reset to the saved one. --><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PipelineModelReader</span><span class="p">(</span><span class="n">MLReader</span><span class="p">[</span><span class="s2">&quot;PipelineModel&quot;</span><span class="p">]):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> (Private) Specialization of :py:class:`MLReader` for :py:class:`PipelineModel` types</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="bp">cls</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="s2">&quot;PipelineModel&quot;</span><span class="p">]):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PipelineModelReader</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cls</span> <span class="o">=</span> <span class="bp">cls</span>
<span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PipelineModel&quot;</span><span class="p">:</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="n">DefaultParamsReader</span><span class="o">.</span><span class="n">loadMetadata</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sc</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;language&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">[</span><span class="s2">&quot;paramMap&quot;</span><span class="p">]</span> <span class="ow">or</span> <span class="n">metadata</span><span class="p">[</span><span class="s2">&quot;paramMap&quot;</span><span class="p">][</span><span class="s2">&quot;language&quot;</span><span class="p">]</span> <span class="o">!=</span> <span class="s2">&quot;Python&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">JavaMLReader</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Type</span><span class="p">[</span><span class="s2">&quot;JavaMLReadable[PipelineModel]&quot;</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">cls</span><span class="p">))</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">uid</span><span class="p">,</span> <span class="n">stages</span> <span class="o">=</span> <span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">metadata</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">sc</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span>
<span class="k">return</span> <span class="n">PipelineModel</span><span class="p">(</span><span class="n">stages</span><span class="o">=</span><span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="n">Transformer</span><span class="p">],</span> <span class="n">stages</span><span class="p">))</span><span class="o">.</span><span class="n">_resetUid</span><span class="p">(</span><span class="n">uid</span><span class="p">)</span>
<!-- PipelineModel listing: the outer viewcode-block div wraps the whole class; copy, write and read each sit in a nested viewcode-block div whose [docs] anchor links back to the API reference page, while __init__, _transform, _from_java and _to_java are plain highlighted spans. --><div class="viewcode-block" id="PipelineModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.PipelineModel.html#pyspark.ml.PipelineModel">[docs]</a><span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PipelineModel</span><span class="p">(</span><span class="n">Model</span><span class="p">,</span> <span class="n">MLReadable</span><span class="p">[</span><span class="s2">&quot;PipelineModel&quot;</span><span class="p">],</span> <span class="n">MLWritable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Represents a compiled pipeline with transformers and fitted models.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">stages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Transformer</span><span class="p">]):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PipelineModel</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">stages</span> <span class="o">=</span> <span class="n">stages</span>
<!-- _transform: feeds the dataset through every stage in list order, each stage receiving the output of the previous one. --><span class="k">def</span> <span class="nf">_transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">stages</span><span class="p">:</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">t</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="k">return</span> <span class="n">dataset</span>
<!-- NOTE(review): copy() returns PipelineModel(stages) without calling _resetUid, unlike _from_java and PipelineModelReader.load; confirm that a copy is not expected to keep the original uid. --><div class="viewcode-block" id="PipelineModel.copy"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.PipelineModel.html#pyspark.ml.PipelineModel.copy">[docs]</a> <span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">extra</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ParamMap&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PipelineModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a copy of this instance.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> :param extra: extra parameters</span>
<span class="sd"> :returns: new instance</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">extra</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">extra</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">()</span>
<span class="n">stages</span> <span class="o">=</span> <span class="p">[</span><span class="n">stage</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">extra</span><span class="p">)</span> <span class="k">for</span> <span class="n">stage</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">stages</span><span class="p">]</span>
<span class="k">return</span> <span class="n">PipelineModel</span><span class="p">(</span><span class="n">stages</span><span class="p">)</span></div>
<div class="viewcode-block" id="PipelineModel.write"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.PipelineModel.html#pyspark.ml.PipelineModel.write">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">MLWriter</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns an MLWriter instance for this ML instance.&quot;&quot;&quot;</span>
<span class="n">allStagesAreJava</span> <span class="o">=</span> <span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">checkStagesForJava</span><span class="p">(</span>
<span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">stages</span><span class="p">)</span>
<span class="p">)</span>
<!-- JavaMLWriter is chosen only when every stage is JavaMLWritable; otherwise the Python PipelineModelWriter is returned. --><span class="k">if</span> <span class="n">allStagesAreJava</span><span class="p">:</span>
<span class="k">return</span> <span class="n">JavaMLWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="k">return</span> <span class="n">PipelineModelWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span></div>
<div class="viewcode-block" id="PipelineModel.read"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.PipelineModel.html#pyspark.ml.PipelineModel.read">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@since</span><span class="p">(</span><span class="s2">&quot;2.0.0&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">PipelineModelReader</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns an MLReader instance for this class.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">PipelineModelReader</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span></div>
<span class="nd">@classmethod</span>
<span class="k">def</span> <span class="nf">_from_java</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">java_stage</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PipelineModel&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Given a Java PipelineModel, create and return a Python wrapper of it.</span>
<span class="sd"> Used for ML persistence.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Load information from java_stage to the instance.</span>
<span class="n">py_stages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Transformer</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">JavaParams</span><span class="o">.</span><span class="n">_from_java</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">java_stage</span><span class="o">.</span><span class="n">stages</span><span class="p">()]</span>
<span class="c1"># Create a new instance of this stage.</span>
<span class="n">py_stage</span> <span class="o">=</span> <span class="bp">cls</span><span class="p">(</span><span class="n">py_stages</span><span class="p">)</span>
<span class="n">py_stage</span><span class="o">.</span><span class="n">_resetUid</span><span class="p">(</span><span class="n">java_stage</span><span class="o">.</span><span class="n">uid</span><span class="p">())</span>
<span class="k">return</span> <span class="n">py_stage</span>
<span class="k">def</span> <span class="nf">_to_java</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Transfer this instance to a Java PipelineModel. Used for ML persistence.</span>
<span class="sd"> :return: Java object equivalent to this instance.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">gateway</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_gateway</span>
<span class="k">assert</span> <span class="n">gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="bp">cls</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">Transformer</span>
<span class="n">java_stages</span> <span class="o">=</span> <span class="n">gateway</span><span class="o">.</span><span class="n">new_array</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stages</span><span class="p">))</span>
<span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">stage</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stages</span><span class="p">):</span>
<span class="n">java_stages</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JavaParams</span><span class="p">,</span> <span class="n">stage</span><span class="p">)</span><span class="o">.</span><span class="n">_to_java</span><span class="p">()</span>
<!-- The stage array is passed directly as an argument when creating the Java PipelineModel; no setStages() call is made here. --><span class="n">_java_obj</span> <span class="o">=</span> <span class="n">JavaParams</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s2">&quot;org.apache.spark.ml.PipelineModel&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">,</span> <span class="n">java_stages</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_java_obj</span></div>
<span class="nd">@inherit_doc</span>
<span class="k">class</span> <span class="nc">PipelineSharedReadWrite</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Functions for :py:class:`MLReader` and :py:class:`MLWriter` shared between</span>
<span class="sd"> :py:class:`Pipeline` and :py:class:`PipelineModel`</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<!-- checkStagesForJava: returns True only when every stage is a JavaMLWritable instance (all() over an empty list is also True). --><span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">checkStagesForJava</span><span class="p">(</span><span class="n">stages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">stage</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">)</span> <span class="k">for</span> <span class="n">stage</span> <span class="ow">in</span> <span class="n">stages</span><span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">validateStages</span><span class="p">(</span><span class="n">stages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Check that all stages are MLWritable.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> stages : list</span>
<span class="sd"> Pipeline stages to check before writing.</span>
<span class="sd"> Raises</span>
<span class="sd"> ------</span>
<span class="sd"> ValueError</span>
<span class="sd"> If a stage is not an instance of :py:class:`MLWritable`. The message</span>
<span class="sd"> names the uid and type of the first offending stage.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">for</span> <span class="n">stage</span> <span class="ow">in</span> <span class="n">stages</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">stage</span><span class="p">,</span> <span class="n">MLWritable</span><span class="p">):</span>
<span class="c1"># Interpolate uid and type into the message. Passing them as extra</span>
<span class="c1"># ValueError arguments would leave the %s placeholders unformatted.</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Pipeline write will fail on this pipeline &quot;</span>
<span class="s2">&quot;because stage </span><span class="si">%s</span><span class="s2"> of type </span><span class="si">%s</span><span class="s2"> is not MLWritable&quot;</span>
<span class="o">%</span> <span class="p">(</span><span class="n">stage</span><span class="o">.</span><span class="n">uid</span><span class="p">,</span> <span class="nb">type</span><span class="p">(</span><span class="n">stage</span><span class="p">))</span>
<span class="p">)</span>
<!-- saveImpl: writes metadata whose paramMap holds the stage uids plus a "language": "Python" marker via DefaultParamsWriter.saveMetadata, then saves each stage through its own writer at the location returned by getStagePath under path/stages. --><span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">saveImpl</span><span class="p">(</span>
<span class="n">instance</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Pipeline</span><span class="p">,</span> <span class="n">PipelineModel</span><span class="p">],</span>
<span class="n">stages</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">],</span>
<span class="n">sc</span><span class="p">:</span> <span class="n">SparkContext</span><span class="p">,</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Save metadata and stages for a :py:class:`Pipeline` or :py:class:`PipelineModel`</span>
<span class="sd"> - save metadata to path/metadata</span>
<span class="sd"> - save stages to stages/IDX_UID</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stageUids</span> <span class="o">=</span> <span class="p">[</span><span class="n">stage</span><span class="o">.</span><span class="n">uid</span> <span class="k">for</span> <span class="n">stage</span> <span class="ow">in</span> <span class="n">stages</span><span class="p">]</span>
<!-- The "language" key written here is what PipelineReader.load and PipelineModelReader.load test to pick the Python branch. --><span class="n">jsonParams</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;stageUids&quot;</span><span class="p">:</span> <span class="n">stageUids</span><span class="p">,</span> <span class="s2">&quot;language&quot;</span><span class="p">:</span> <span class="s2">&quot;Python&quot;</span><span class="p">}</span>
<span class="n">DefaultParamsWriter</span><span class="o">.</span><span class="n">saveMetadata</span><span class="p">(</span><span class="n">instance</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">sc</span><span class="p">,</span> <span class="n">paramMap</span><span class="o">=</span><span class="n">jsonParams</span><span class="p">)</span>
<span class="n">stagesDir</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="s2">&quot;stages&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">stage</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">stages</span><span class="p">):</span>
<span class="n">cast</span><span class="p">(</span><span class="n">MLWritable</span><span class="p">,</span> <span class="n">stage</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="p">()</span><span class="o">.</span><span class="n">save</span><span class="p">(</span>
<span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">getStagePath</span><span class="p">(</span><span class="n">stage</span><span class="o">.</span><span class="n">uid</span><span class="p">,</span> <span class="n">index</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">stages</span><span class="p">),</span> <span class="n">stagesDir</span><span class="p">)</span>
<span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">load</span><span class="p">(</span>
<span class="n">metadata</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">sc</span><span class="p">:</span> <span class="n">SparkContext</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PipelineStage&quot;</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Load metadata and stages for a :py:class:`Pipeline` or :py:class:`PipelineModel`</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> tuple</span>
<span class="sd"> (UID, list of stages)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stagesDir</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="s2">&quot;stages&quot;</span><span class="p">)</span>
<span class="n">stageUids</span> <span class="o">=</span> <span class="n">metadata</span><span class="p">[</span><span class="s2">&quot;paramMap&quot;</span><span class="p">][</span><span class="s2">&quot;stageUids&quot;</span><span class="p">]</span>
<span class="n">stages</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">stageUid</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">stageUids</span><span class="p">):</span>
<span class="n">stagePath</span> <span class="o">=</span> <span class="n">PipelineSharedReadWrite</span><span class="o">.</span><span class="n">getStagePath</span><span class="p">(</span>
<span class="n">stageUid</span><span class="p">,</span> <span class="n">index</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">stageUids</span><span class="p">),</span> <span class="n">stagesDir</span>
<span class="p">)</span>
<span class="n">stage</span><span class="p">:</span> <span class="s2">&quot;PipelineStage&quot;</span> <span class="o">=</span> <span class="n">DefaultParamsReader</span><span class="o">.</span><span class="n">loadParamsInstance</span><span class="p">(</span><span class="n">stagePath</span><span class="p">,</span> <span class="n">sc</span><span class="p">)</span>
<span class="n">stages</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">stage</span><span class="p">)</span>
<span class="k">return</span> <span class="p">(</span><span class="n">metadata</span><span class="p">[</span><span class="s2">&quot;uid&quot;</span><span class="p">],</span> <span class="n">stages</span><span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">getStagePath</span><span class="p">(</span><span class="n">stageUid</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">stageIdx</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">numStages</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">stagesDir</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Get path for saving the given stage.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">stageIdxDigits</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">numStages</span><span class="p">))</span>
<span class="n">stageDir</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">stageIdx</span><span class="p">)</span><span class="o">.</span><span class="n">zfill</span><span class="p">(</span><span class="n">stageIdxDigits</span><span class="p">)</span> <span class="o">+</span> <span class="s2">&quot;_&quot;</span> <span class="o">+</span> <span class="n">stageUid</span>
<span class="n">stagePath</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">stagesDir</span><span class="p">,</span> <span class="n">stageDir</span><span class="p">)</span>
<span class="k">return</span> <span class="n">stagePath</span>
</pre></div>
</div>
<!-- Previous / next buttons (container is empty on this page) -->
<div class="prev-next-area">
</div>
</main>
</div>
</div>
<!-- Theme behaviour script; the same URL is declared as a preload in <head>, and it is loaded here after the main content. -->
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script>
<footer class="footer mt-5 mt-md-0">
  <!-- Page footer: copyright notice and generator credit. -->
  <div class="container">
    <div class="footer-item">
      <!-- NOTE(review): the copyright holder is blank in the generated output; presumably the Sphinx "copyright" setting is unset. Confirm and fix in the docs configuration. -->
      <p class="copyright">
        © Copyright .<br>
      </p>
    </div>
    <div class="footer-item">
      <p class="sphinx-version">
        Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br>
      </p>
    </div>
  </div>
</footer>
</body>
</html>