| |
| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en"> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>pyspark.ml.feature — PySpark 4.0.0-preview2 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"; |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/clipboard.min.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/ml/feature';</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/feature.html" /> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="None"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../../../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../../../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview2 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/ml/feature.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar hide-on-wide"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview2 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/ml/feature.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../../../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">pyspark.ml.feature</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <h1>Source code for pyspark.ml.feature</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="n">overload</span><span class="p">,</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">Generic</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">TypeVar</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">keyword_only</span><span class="p">,</span> <span class="n">since</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">_convert_to_vector</span><span class="p">,</span> <span class="n">DenseMatrix</span><span class="p">,</span> <span class="n">DenseVector</span><span class="p">,</span> <span class="n">Vector</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.param.shared</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">HasThreshold</span><span class="p">,</span> |
| <span class="n">HasThresholds</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCols</span><span class="p">,</span> |
| <span class="n">HasHandleInvalid</span><span class="p">,</span> |
| <span class="n">HasRelativeError</span><span class="p">,</span> |
| <span class="n">HasFeaturesCol</span><span class="p">,</span> |
| <span class="n">HasLabelCol</span><span class="p">,</span> |
| <span class="n">HasSeed</span><span class="p">,</span> |
| <span class="n">HasNumFeatures</span><span class="p">,</span> |
| <span class="n">HasStepSize</span><span class="p">,</span> |
| <span class="n">HasMaxIter</span><span class="p">,</span> |
| <span class="n">TypeConverters</span><span class="p">,</span> |
| <span class="n">Param</span><span class="p">,</span> |
| <span class="n">Params</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.util</span> <span class="kn">import</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.wrapper</span> <span class="kn">import</span> <span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">JavaModel</span><span class="p">,</span> <span class="n">JavaParams</span><span class="p">,</span> <span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">_jvm</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.common</span> <span class="kn">import</span> <span class="n">inherit_doc</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span> |
| |
| <span class="n">JM</span> <span class="o">=</span> <span class="n">TypeVar</span><span class="p">(</span><span class="s2">"JM"</span><span class="p">,</span> <span class="n">bound</span><span class="o">=</span><span class="n">JavaTransformer</span><span class="p">)</span> |
| <span class="n">P</span> <span class="o">=</span> <span class="n">TypeVar</span><span class="p">(</span><span class="s2">"P"</span><span class="p">,</span> <span class="n">bound</span><span class="o">=</span><span class="n">Params</span><span class="p">)</span> |
| |
| <span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="s2">"Binarizer"</span><span class="p">,</span> |
| <span class="s2">"BucketedRandomProjectionLSH"</span><span class="p">,</span> |
| <span class="s2">"BucketedRandomProjectionLSHModel"</span><span class="p">,</span> |
| <span class="s2">"Bucketizer"</span><span class="p">,</span> |
| <span class="s2">"ChiSqSelector"</span><span class="p">,</span> |
| <span class="s2">"ChiSqSelectorModel"</span><span class="p">,</span> |
| <span class="s2">"CountVectorizer"</span><span class="p">,</span> |
| <span class="s2">"CountVectorizerModel"</span><span class="p">,</span> |
| <span class="s2">"DCT"</span><span class="p">,</span> |
| <span class="s2">"ElementwiseProduct"</span><span class="p">,</span> |
| <span class="s2">"FeatureHasher"</span><span class="p">,</span> |
| <span class="s2">"HashingTF"</span><span class="p">,</span> |
| <span class="s2">"IDF"</span><span class="p">,</span> |
| <span class="s2">"IDFModel"</span><span class="p">,</span> |
| <span class="s2">"Imputer"</span><span class="p">,</span> |
| <span class="s2">"ImputerModel"</span><span class="p">,</span> |
| <span class="s2">"IndexToString"</span><span class="p">,</span> |
| <span class="s2">"Interaction"</span><span class="p">,</span> |
| <span class="s2">"MaxAbsScaler"</span><span class="p">,</span> |
| <span class="s2">"MaxAbsScalerModel"</span><span class="p">,</span> |
| <span class="s2">"MinHashLSH"</span><span class="p">,</span> |
| <span class="s2">"MinHashLSHModel"</span><span class="p">,</span> |
| <span class="s2">"MinMaxScaler"</span><span class="p">,</span> |
| <span class="s2">"MinMaxScalerModel"</span><span class="p">,</span> |
| <span class="s2">"NGram"</span><span class="p">,</span> |
| <span class="s2">"Normalizer"</span><span class="p">,</span> |
| <span class="s2">"OneHotEncoder"</span><span class="p">,</span> |
| <span class="s2">"OneHotEncoderModel"</span><span class="p">,</span> |
| <span class="s2">"PCA"</span><span class="p">,</span> |
| <span class="s2">"PCAModel"</span><span class="p">,</span> |
| <span class="s2">"PolynomialExpansion"</span><span class="p">,</span> |
| <span class="s2">"QuantileDiscretizer"</span><span class="p">,</span> |
| <span class="s2">"RobustScaler"</span><span class="p">,</span> |
| <span class="s2">"RobustScalerModel"</span><span class="p">,</span> |
| <span class="s2">"RegexTokenizer"</span><span class="p">,</span> |
| <span class="s2">"RFormula"</span><span class="p">,</span> |
| <span class="s2">"RFormulaModel"</span><span class="p">,</span> |
| <span class="s2">"SQLTransformer"</span><span class="p">,</span> |
| <span class="s2">"StandardScaler"</span><span class="p">,</span> |
| <span class="s2">"StandardScalerModel"</span><span class="p">,</span> |
| <span class="s2">"StopWordsRemover"</span><span class="p">,</span> |
| <span class="s2">"StringIndexer"</span><span class="p">,</span> |
| <span class="s2">"StringIndexerModel"</span><span class="p">,</span> |
| <span class="s2">"Tokenizer"</span><span class="p">,</span> |
| <span class="s2">"UnivariateFeatureSelector"</span><span class="p">,</span> |
| <span class="s2">"UnivariateFeatureSelectorModel"</span><span class="p">,</span> |
| <span class="s2">"VarianceThresholdSelector"</span><span class="p">,</span> |
| <span class="s2">"VarianceThresholdSelectorModel"</span><span class="p">,</span> |
| <span class="s2">"VectorAssembler"</span><span class="p">,</span> |
| <span class="s2">"VectorIndexer"</span><span class="p">,</span> |
| <span class="s2">"VectorIndexerModel"</span><span class="p">,</span> |
| <span class="s2">"VectorSizeHint"</span><span class="p">,</span> |
| <span class="s2">"VectorSlicer"</span><span class="p">,</span> |
| <span class="s2">"Word2Vec"</span><span class="p">,</span> |
| <span class="s2">"Word2VecModel"</span><span class="p">,</span> |
| <span class="p">]</span> |
| |
| |
| <div class="viewcode-block" id="Binarizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">Binarizer</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasThreshold</span><span class="p">,</span> |
| <span class="n">HasThresholds</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCols</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Binarizer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Binarize a column of continuous features given a threshold. Since 3.0.0,</span> |
| <span class="sd"> :py:class:`Binarize` can map multiple columns at once by setting the :py:attr:`inputCols`</span> |
| <span class="sd"> parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters</span> |
| <span class="sd"> are set, an Exception will be thrown. The :py:attr:`threshold` parameter is used for</span> |
| <span class="sd"> single column usage, and :py:attr:`thresholds` is for multiple columns.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(0.5,)], ["values"])</span> |
| <span class="sd"> >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")</span> |
| <span class="sd"> >>> binarizer.setThreshold(1.0)</span> |
| <span class="sd"> Binarizer...</span> |
| <span class="sd"> >>> binarizer.setInputCol("values")</span> |
| <span class="sd"> Binarizer...</span> |
| <span class="sd"> >>> binarizer.setOutputCol("features")</span> |
| <span class="sd"> Binarizer...</span> |
| <span class="sd"> >>> binarizer.transform(df).head().features</span> |
| <span class="sd"> 0.0</span> |
| <span class="sd"> >>> binarizer.setParams(outputCol="freqs").transform(df).head().freqs</span> |
| <span class="sd"> 0.0</span> |
| <span class="sd"> >>> params = {binarizer.threshold: -0.5, binarizer.outputCol: "vector"}</span> |
| <span class="sd"> >>> binarizer.transform(df, params).head().vector</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> >>> binarizerPath = temp_path + "/binarizer"</span> |
| <span class="sd"> >>> binarizer.save(binarizerPath)</span> |
| <span class="sd"> >>> loadedBinarizer = Binarizer.load(binarizerPath)</span> |
| <span class="sd"> >>> loadedBinarizer.getThreshold() == binarizer.getThreshold()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedBinarizer.transform(df).take(1) == binarizer.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(0.5, 0.3)], ["values1", "values2"])</span> |
| <span class="sd"> >>> binarizer2 = Binarizer(thresholds=[0.0, 1.0])</span> |
| <span class="sd"> >>> binarizer2.setInputCols(["values1", "values2"]).setOutputCols(["output1", "output2"])</span> |
| <span class="sd"> Binarizer...</span> |
| <span class="sd"> >>> binarizer2.transform(df2).show()</span> |
| <span class="sd"> +-------+-------+-------+-------+</span> |
| <span class="sd"> |values1|values2|output1|output2|</span> |
| <span class="sd"> +-------+-------+-------+-------+</span> |
| <span class="sd"> | 0.5| 0.3| 1.0| 0.0|</span> |
| <span class="sd"> +-------+-------+-------+-------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">threshold</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"threshold"</span><span class="p">,</span> |
| <span class="s2">"Param for threshold used to binarize continuous features. "</span> |
| <span class="o">+</span> <span class="s2">"The features greater than the threshold will be binarized to 1.0. "</span> |
| <span class="o">+</span> <span class="s2">"The features equal to or less than the threshold will be binarized to 0.0"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">thresholds</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"thresholds"</span><span class="p">,</span> |
| <span class="s2">"Param for array of threshold used to binarize continuous features. "</span> |
| <span class="o">+</span> <span class="s2">"This is for multiple columns input. If transforming multiple columns "</span> |
| <span class="o">+</span> <span class="s2">"and thresholds is not set, but threshold is set, then threshold will "</span> |
| <span class="o">+</span> <span class="s2">"be applied across all columns."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \</span> |
| <span class="sd"> inputCols=None, outputCols=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">Binarizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.Binarizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="Binarizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">thresholds</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \</span> |
| <span class="sd"> inputCols=None, outputCols=None)</span> |
| <span class="sd"> Sets params for this Binarizer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Binarizer.setThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`threshold`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Binarizer.setThresholds"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setThresholds">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setThresholds</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`thresholds`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">thresholds</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Binarizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Binarizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Binarizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Binarizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Binarizer.html#pyspark.ml.feature.Binarizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Binarizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_LSHParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Mixin for Locality Sensitive Hashing (LSH) algorithm parameters.</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">numHashTables</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"numHashTables"</span><span class="p">,</span> |
| <span class="s2">"number of hash tables, where "</span> |
| <span class="o">+</span> <span class="s2">"increasing number of hash tables lowers the false negative rate, "</span> |
| <span class="o">+</span> <span class="s2">"and decreasing it improves the running performance."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_LSHParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numHashTables</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">getNumHashTables</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of numHashTables or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numHashTables</span><span class="p">)</span> |
| |
| |
| <span class="k">class</span> <span class="nc">_LSH</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">JM</span><span class="p">],</span> <span class="n">_LSHParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">,</span> <span class="n">Generic</span><span class="p">[</span><span class="n">JM</span><span class="p">]):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Mixin for Locality Sensitive Hashing (LSH).</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">setNumHashTables</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`numHashTables`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numHashTables</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| |
| <span class="k">class</span> <span class="nc">_LSHModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_LSHParams</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Mixin for Locality Sensitive Hashing (LSH) models.</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">approxNearestNeighbors</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">dataset</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">key</span><span class="p">:</span> <span class="n">Vector</span><span class="p">,</span> |
| <span class="n">numNearestNeighbors</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> |
| <span class="n">distCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"distCol"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Given a large dataset and an item, approximately find at most k items which have the</span> |
| <span class="sd"> closest distance to the item. If the :py:attr:`outputCol` is missing, the method will</span> |
| <span class="sd"> transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows</span> |
| <span class="sd"> caching of the transformed data when necessary.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method is experimental and will likely change behavior in the next release.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dataset : :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> The dataset to search for nearest neighbors of the key.</span> |
| <span class="sd"> key : :py:class:`pyspark.ml.linalg.Vector`</span> |
| <span class="sd"> Feature vector representing the item to search for.</span> |
| <span class="sd"> numNearestNeighbors : int</span> |
| <span class="sd"> The maximum number of nearest neighbors.</span> |
| <span class="sd"> distCol : str, optional</span> |
| <span class="sd"> Output column for storing the distance between each result row and the key.</span> |
| <span class="sd"> Use "distCol" as default value if it's not specified.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> A dataset containing at most k items closest to the key. A column "distCol" is</span> |
| <span class="sd"> added to show the distance between each row and the key.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"approxNearestNeighbors"</span><span class="p">,</span> <span class="n">dataset</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">numNearestNeighbors</span><span class="p">,</span> <span class="n">distCol</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">approxSimilarityJoin</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">datasetA</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">datasetB</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="n">distCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"distCol"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Join two datasets to approximately find all pairs of rows whose distance is smaller than</span> |
| <span class="sd"> the threshold. If the :py:attr:`outputCol` is missing, the method will transform the data;</span> |
| <span class="sd"> if the :py:attr:`outputCol` exists, it will use that. This allows caching of the</span> |
| <span class="sd"> transformed data when necessary.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> datasetA : :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> One of the datasets to join.</span> |
| <span class="sd"> datasetB : :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> Another dataset to join.</span> |
| <span class="sd"> threshold : float</span> |
| <span class="sd"> The threshold for the distance of row pairs.</span> |
| <span class="sd"> distCol : str, optional</span> |
| <span class="sd"> Output column for storing the distance between each pair of rows. Use</span> |
| <span class="sd"> "distCol" as default value if it's not specified.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> A joined dataset containing pairs of rows. The original rows are in columns</span> |
| <span class="sd"> "datasetA" and "datasetB", and a column "distCol" is added to show the distance</span> |
| <span class="sd"> between each pair.</span> |
| <span class="sd"> """</span> |
| <span class="n">threshold</span> <span class="o">=</span> <span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">(</span><span class="n">threshold</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"approxSimilarityJoin"</span><span class="p">,</span> <span class="n">datasetA</span><span class="p">,</span> <span class="n">datasetB</span><span class="p">,</span> <span class="n">threshold</span><span class="p">,</span> <span class="n">distCol</span><span class="p">)</span> |
| |
| |
| <span class="k">class</span> <span class="nc">_BucketedRandomProjectionLSHParams</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`BucketedRandomProjectionLSH` and</span> |
| <span class="sd"> :py:class:`BucketedRandomProjectionLSHModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">bucketLength</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"bucketLength"</span><span class="p">,</span> |
| <span class="s2">"the length of each hash bucket, "</span> <span class="o">+</span> <span class="s2">"a larger bucket lowers the false negative rate."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getBucketLength</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of bucketLength or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Params</span><span class="p">,</span> <span class="bp">self</span><span class="p">))</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">bucketLength</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="BucketedRandomProjectionLSH"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">BucketedRandomProjectionLSH</span><span class="p">(</span> |
| <span class="n">_LSH</span><span class="p">[</span><span class="s2">"BucketedRandomProjectionLSHModel"</span><span class="p">],</span> |
| <span class="n">_LSHParams</span><span class="p">,</span> |
| <span class="n">_BucketedRandomProjectionLSHParams</span><span class="p">,</span> |
| <span class="n">HasSeed</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"BucketedRandomProjectionLSH"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> LSH class for Euclidean distance metrics.</span> |
| <span class="sd"> The input is dense or sparse vectors, each of which represents a point in the Euclidean</span> |
| <span class="sd"> distance space. The output will be vectors of configurable dimension. Hash values in the same</span> |
| <span class="sd"> dimension are calculated by the same hash function.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| |
| <span class="sd"> - `Stable Distributions in Wikipedia article on Locality-sensitive hashing \</span> |
| <span class="sd"> <https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_</span> |
| <span class="sd"> - `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> from pyspark.sql.functions import col</span> |
| <span class="sd"> >>> data = [(0, Vectors.dense([-1.0, -1.0 ]),),</span> |
| <span class="sd"> ... (1, Vectors.dense([-1.0, 1.0 ]),),</span> |
| <span class="sd"> ... (2, Vectors.dense([1.0, -1.0 ]),),</span> |
| <span class="sd"> ... (3, Vectors.dense([1.0, 1.0]),)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["id", "features"])</span> |
| <span class="sd"> >>> brp = BucketedRandomProjectionLSH()</span> |
| <span class="sd"> >>> brp.setInputCol("features")</span> |
| <span class="sd"> BucketedRandomProjectionLSH...</span> |
| <span class="sd"> >>> brp.setOutputCol("hashes")</span> |
| <span class="sd"> BucketedRandomProjectionLSH...</span> |
| <span class="sd"> >>> brp.setSeed(12345)</span> |
| <span class="sd"> BucketedRandomProjectionLSH...</span> |
| <span class="sd"> >>> brp.setBucketLength(1.0)</span> |
| <span class="sd"> BucketedRandomProjectionLSH...</span> |
| <span class="sd"> >>> model = brp.fit(df)</span> |
| <span class="sd"> >>> model.getBucketLength()</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> >>> model.setOutputCol("hashes")</span> |
| <span class="sd"> BucketedRandomProjectionLSHModel...</span> |
| <span class="sd"> >>> model.transform(df).head()</span> |
| <span class="sd"> Row(id=0, features=DenseVector([-1.0, -1.0]), hashes=[DenseVector([-1.0])])</span> |
| <span class="sd"> >>> data2 = [(4, Vectors.dense([2.0, 2.0 ]),),</span> |
| <span class="sd"> ... (5, Vectors.dense([2.0, 3.0 ]),),</span> |
| <span class="sd"> ... (6, Vectors.dense([3.0, 2.0 ]),),</span> |
| <span class="sd"> ... (7, Vectors.dense([3.0, 3.0]),)]</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame(data2, ["id", "features"])</span> |
| <span class="sd"> >>> model.approxNearestNeighbors(df2, Vectors.dense([1.0, 2.0]), 1).collect()</span> |
| <span class="sd"> [Row(id=4, features=DenseVector([2.0, 2.0]), hashes=[DenseVector([1.0])], distCol=1.0)]</span> |
| <span class="sd"> >>> model.approxSimilarityJoin(df, df2, 3.0, distCol="EuclideanDistance").select(</span> |
| <span class="sd"> ... col("datasetA.id").alias("idA"),</span> |
| <span class="sd"> ... col("datasetB.id").alias("idB"),</span> |
| <span class="sd"> ... col("EuclideanDistance")).show()</span> |
| <span class="sd"> +---+---+-----------------+</span> |
| <span class="sd"> |idA|idB|EuclideanDistance|</span> |
| <span class="sd"> +---+---+-----------------+</span> |
| <span class="sd"> | 3| 6| 2.23606797749979|</span> |
| <span class="sd"> +---+---+-----------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> model.approxSimilarityJoin(df, df2, 3, distCol="EuclideanDistance").select(</span> |
| <span class="sd"> ... col("datasetA.id").alias("idA"),</span> |
| <span class="sd"> ... col("datasetB.id").alias("idB"),</span> |
| <span class="sd"> ... col("EuclideanDistance")).show()</span> |
| <span class="sd"> +---+---+-----------------+</span> |
| <span class="sd"> |idA|idB|EuclideanDistance|</span> |
| <span class="sd"> +---+---+-----------------+</span> |
| <span class="sd"> | 3| 6| 2.23606797749979|</span> |
| <span class="sd"> +---+---+-----------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> brpPath = temp_path + "/brp"</span> |
| <span class="sd"> >>> brp.save(brpPath)</span> |
| <span class="sd"> >>> brp2 = BucketedRandomProjectionLSH.load(brpPath)</span> |
| <span class="sd"> >>> brp2.getBucketLength() == brp.getBucketLength()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/brp-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> model2 = BucketedRandomProjectionLSHModel.load(modelPath)</span> |
| <span class="sd"> >>> model.transform(df).head().hashes == model2.transform(df).head().hashes</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">bucketLength</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \</span> |
| <span class="sd"> bucketLength=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">BucketedRandomProjectionLSH</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.BucketedRandomProjectionLSH"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="BucketedRandomProjectionLSH.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">bucketLength</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"BucketedRandomProjectionLSH"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \</span> |
| <span class="sd"> bucketLength=None)</span> |
| <span class="sd"> Sets params for this BucketedRandomProjectionLSH.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BucketedRandomProjectionLSH.setBucketLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setBucketLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setBucketLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BucketedRandomProjectionLSH"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`bucketLength`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">bucketLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BucketedRandomProjectionLSH.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSH.html#pyspark.ml.feature.BucketedRandomProjectionLSH.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BucketedRandomProjectionLSH"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BucketedRandomProjectionLSHModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">BucketedRandomProjectionLSHModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="BucketedRandomProjectionLSHModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.BucketedRandomProjectionLSHModel.html#pyspark.ml.feature.BucketedRandomProjectionLSHModel">[docs]</a><span class="k">class</span> <span class="nc">BucketedRandomProjectionLSHModel</span><span class="p">(</span> |
| <span class="n">_LSHModel</span><span class="p">,</span> |
| <span class="n">_BucketedRandomProjectionLSHParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"BucketedRandomProjectionLSHModel"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are</span> |
| <span class="sd"> stored. The vectors are normalized to be unit vectors and each vector is used in a hash</span> |
| <span class="sd"> function: :math:`h_i(x) = floor(r_i \cdot x / bucketLength)` where :math:`r_i` is the</span> |
| <span class="sd"> i-th random unit vector. The number of buckets will be `(max L2 norm of input vectors) /</span> |
| <span class="sd"> bucketLength`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| <span class="sd"> """</span></div> |
| |
| |
| <div class="viewcode-block" id="Bucketizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">Bucketizer</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCols</span><span class="p">,</span> |
| <span class="n">HasHandleInvalid</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Bucketizer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Maps a column of continuous features to a column of feature buckets. Since 3.0.0,</span> |
| <span class="sd"> :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`</span> |
| <span class="sd"> parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters</span> |
| <span class="sd"> are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single</span> |
| <span class="sd"> column usage, and :py:attr:`splitsArray` is for multiple columns.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")),</span> |
| <span class="sd"> ... (float("nan"), 1.0), (float("nan"), 0.0)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(values, ["values1", "values2"])</span> |
| <span class="sd"> >>> bucketizer = Bucketizer()</span> |
| <span class="sd"> >>> bucketizer.setSplits([-float("inf"), 0.5, 1.4, float("inf")])</span> |
| <span class="sd"> Bucketizer...</span> |
| <span class="sd"> >>> bucketizer.setInputCol("values1")</span> |
| <span class="sd"> Bucketizer...</span> |
| <span class="sd"> >>> bucketizer.setOutputCol("buckets")</span> |
| <span class="sd"> Bucketizer...</span> |
| <span class="sd"> >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect()</span> |
| <span class="sd"> >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1"))</span> |
| <span class="sd"> >>> bucketed.show(truncate=False)</span> |
| <span class="sd"> +-------+-------+</span> |
| <span class="sd"> |values1|buckets|</span> |
| <span class="sd"> +-------+-------+</span> |
| <span class="sd"> |0.1 |0.0 |</span> |
| <span class="sd"> |0.4 |0.0 |</span> |
| <span class="sd"> |1.2 |1.0 |</span> |
| <span class="sd"> |1.5 |2.0 |</span> |
| <span class="sd"> |NaN |3.0 |</span> |
| <span class="sd"> |NaN |3.0 |</span> |
| <span class="sd"> +-------+-------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> bucketizer.setParams(outputCol="b").transform(df).head().b</span> |
| <span class="sd"> 0.0</span> |
| <span class="sd"> >>> bucketizerPath = temp_path + "/bucketizer"</span> |
| <span class="sd"> >>> bucketizer.save(bucketizerPath)</span> |
| <span class="sd"> >>> loadedBucketizer = Bucketizer.load(bucketizerPath)</span> |
| <span class="sd"> >>> loadedBucketizer.getSplits() == bucketizer.getSplits()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedBucketizer.transform(df).take(1) == bucketizer.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect()</span> |
| <span class="sd"> >>> len(bucketed)</span> |
| <span class="sd"> 4</span> |
| <span class="sd"> >>> bucketizer2 = Bucketizer(splitsArray=</span> |
| <span class="sd"> ... [[-float("inf"), 0.5, 1.4, float("inf")], [-float("inf"), 0.5, float("inf")]],</span> |
| <span class="sd"> ... inputCols=["values1", "values2"], outputCols=["buckets1", "buckets2"])</span> |
| <span class="sd"> >>> bucketed2 = bucketizer2.setHandleInvalid("keep").transform(df)</span> |
| <span class="sd"> >>> bucketed2.show(truncate=False)</span> |
| <span class="sd"> +-------+-------+--------+--------+</span> |
| <span class="sd"> |values1|values2|buckets1|buckets2|</span> |
| <span class="sd"> +-------+-------+--------+--------+</span> |
| <span class="sd"> |0.1 |0.0 |0.0 |0.0 |</span> |
| <span class="sd"> |0.4 |1.0 |0.0 |1.0 |</span> |
| <span class="sd"> |1.2 |1.3 |1.0 |1.0 |</span> |
| <span class="sd"> |1.5 |NaN |2.0 |2.0 |</span> |
| <span class="sd"> |NaN |1.0 |3.0 |1.0 |</span> |
| <span class="sd"> |NaN |0.0 |3.0 |0.0 |</span> |
| <span class="sd"> +-------+-------+--------+--------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">splits</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"splits"</span><span class="p">,</span> |
| <span class="s2">"Split points for mapping continuous features into buckets. With n+1 splits, "</span> |
| <span class="o">+</span> <span class="s2">"there are n buckets. A bucket defined by splits x,y holds values in the "</span> |
| <span class="o">+</span> <span class="s2">"range [x,y) except the last bucket, which also includes y. The splits "</span> |
| <span class="o">+</span> <span class="s2">"should be of length >= 3 and strictly increasing. Values at -inf, inf must be "</span> |
| <span class="o">+</span> <span class="s2">"explicitly provided to cover all Double values; otherwise, values outside the "</span> |
| <span class="o">+</span> <span class="s2">"splits specified will be treated as errors."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"how to handle invalid entries "</span> |
| <span class="s2">"containing NaN values. Values outside the splits will always be treated "</span> |
| <span class="s2">"as errors. Options are 'skip' (filter out rows with invalid values), "</span> |
| <span class="o">+</span> <span class="s2">"'error' (throw an error), or 'keep' (keep invalid values in a "</span> |
| <span class="o">+</span> <span class="s2">"special additional bucket). Note that in the multiple column "</span> |
| <span class="o">+</span> <span class="s2">"case, the invalid handling is applied to all columns. That said "</span> |
| <span class="o">+</span> <span class="s2">"for 'error' it will throw an error if any invalids are found in "</span> |
| <span class="o">+</span> <span class="s2">"any column, for 'skip' it will skip rows with any invalids in "</span> |
| <span class="o">+</span> <span class="s2">"any columns, etc."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">splitsArray</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"splitsArray"</span><span class="p">,</span> |
| <span class="s2">"The array of split points for mapping "</span> |
| <span class="o">+</span> <span class="s2">"continuous features into buckets for multiple columns. For each input "</span> |
| <span class="o">+</span> <span class="s2">"column, with n+1 splits, there are n buckets. A bucket defined by "</span> |
| <span class="o">+</span> <span class="s2">"splits x,y holds values in the range [x,y) except the last bucket, "</span> |
| <span class="o">+</span> <span class="s2">"which also includes y. The splits should be of length >= 3 and "</span> |
| <span class="o">+</span> <span class="s2">"strictly increasing. Values at -inf, inf must be explicitly provided "</span> |
| <span class="o">+</span> <span class="s2">"to cover all Double values; otherwise, values outside the splits "</span> |
| <span class="o">+</span> <span class="s2">"specified will be treated as errors."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListListFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \</span> |
| <span class="sd"> splitsArray=None, inputCols=None, outputCols=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">Bucketizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.Bucketizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="Bucketizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">splits</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">splitsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \</span> |
| <span class="sd"> splitsArray=None, inputCols=None, outputCols=None)</span> |
| <span class="sd"> Sets params for this Bucketizer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.setSplits"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setSplits">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`splits`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">splits</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.getSplits"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.getSplits">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of splits or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">splits</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.setSplitsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setSplitsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSplitsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`splitsArray`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">splitsArray</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.getSplitsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.getSplitsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSplitsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the array of split points or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">splitsArray</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Bucketizer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Bucketizer.html#pyspark.ml.feature.Bucketizer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Bucketizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_CountVectorizerParams</span><span class="p">(</span><span class="n">JavaParams</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`CountVectorizer` and :py:class:`CountVectorizerModel`.</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">minTF</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"minTF"</span><span class="p">,</span> |
| <span class="s2">"Filter to ignore rare words in"</span> |
| <span class="o">+</span> <span class="s2">" a document. For each document, terms with frequency/count less than the given"</span> |
| <span class="o">+</span> <span class="s2">" threshold are ignored. If this is an integer >= 1, then this specifies a count (of"</span> |
| <span class="o">+</span> <span class="s2">" times the term must appear in the document); if this is a double in [0,1), then this "</span> |
| <span class="o">+</span> <span class="s2">"specifies a fraction (out of the document's token count). Note that the parameter is "</span> |
| <span class="o">+</span> <span class="s2">"only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">minDF</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"minDF"</span><span class="p">,</span> |
| <span class="s2">"Specifies the minimum number of"</span> |
| <span class="o">+</span> <span class="s2">" different documents a term must appear in to be included in the vocabulary."</span> |
| <span class="o">+</span> <span class="s2">" If this is an integer >= 1, this specifies the number of documents the term must"</span> |
| <span class="o">+</span> <span class="s2">" appear in; if this is a double in [0,1), then this specifies the fraction of documents."</span> |
| <span class="o">+</span> <span class="s2">" Default 1.0"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">maxDF</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"maxDF"</span><span class="p">,</span> |
| <span class="s2">"Specifies the maximum number of"</span> |
| <span class="o">+</span> <span class="s2">" different documents a term could appear in to be included in the vocabulary."</span> |
| <span class="o">+</span> <span class="s2">" A term that appears more than the threshold will be ignored. If this is an"</span> |
| <span class="o">+</span> <span class="s2">" integer >= 1, this specifies the maximum number of documents the term could appear in;"</span> |
| <span class="o">+</span> <span class="s2">" if this is a double in [0,1), then this specifies the maximum"</span> |
| <span class="o">+</span> <span class="s2">" fraction of documents the term could appear in."</span> |
| <span class="o">+</span> <span class="s2">" Default (2^63) - 1"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">vocabSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"vocabSize"</span><span class="p">,</span> |
| <span class="s2">"max size of the vocabulary. Default 1 << 18."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">binary</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"binary"</span><span class="p">,</span> |
| <span class="s2">"Binary toggle to control the output vector values."</span> |
| <span class="o">+</span> <span class="s2">" If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful"</span> |
| <span class="o">+</span> <span class="s2">" for discrete probabilistic models that model binary events rather than integer counts."</span> |
| <span class="o">+</span> <span class="s2">" Default False"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_CountVectorizerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">maxDF</span><span class="o">=</span><span class="mi">2</span><span class="o">**</span><span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of minTF or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minTF</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMinDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of minDF or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDF</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMaxDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of maxDF or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxDF</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getVocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of vocabSize or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocabSize</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of binary or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">binary</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="CountVectorizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">CountVectorizer</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"CountVectorizerModel"</span><span class="p">],</span> |
| <span class="n">_CountVectorizerParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"CountVectorizer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extracts a vocabulary from document collections and generates a :py:class:`CountVectorizerModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],</span> |
| <span class="sd"> ... ["label", "raw"])</span> |
| <span class="sd"> >>> cv = CountVectorizer()</span> |
| <span class="sd"> >>> cv.setInputCol("raw")</span> |
| <span class="sd"> CountVectorizer...</span> |
| <span class="sd"> >>> cv.setOutputCol("vectors")</span> |
| <span class="sd"> CountVectorizer...</span> |
| <span class="sd"> >>> model = cv.fit(df)</span> |
| <span class="sd"> >>> model.setInputCol("raw")</span> |
| <span class="sd"> CountVectorizerModel...</span> |
| <span class="sd"> >>> model.transform(df).show(truncate=False)</span> |
| <span class="sd"> +-----+---------------+-------------------------+</span> |
| <span class="sd"> |label|raw |vectors |</span> |
| <span class="sd"> +-----+---------------+-------------------------+</span> |
| <span class="sd"> |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|</span> |
| <span class="sd"> |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|</span> |
| <span class="sd"> +-----+---------------+-------------------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> sorted(model.vocabulary) == ['a', 'b', 'c']</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> countVectorizerPath = temp_path + "/count-vectorizer"</span> |
| <span class="sd"> >>> cv.save(countVectorizerPath)</span> |
| <span class="sd"> >>> loadedCv = CountVectorizer.load(countVectorizerPath)</span> |
| <span class="sd"> >>> loadedCv.getMinDF() == cv.getMinDF()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedCv.getMinTF() == cv.getMinTF()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedCv.getVocabSize() == cv.getVocabSize()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/count-vectorizer-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = CountVectorizerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.vocabulary == model.vocabulary</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> fromVocabModel = CountVectorizerModel.from_vocabulary(["a", "b", "c"],</span> |
| <span class="sd"> ... inputCol="raw", outputCol="vectors")</span> |
| <span class="sd"> >>> fromVocabModel.transform(df).show(truncate=False)</span> |
| <span class="sd"> +-----+---------------+-------------------------+</span> |
| <span class="sd"> |label|raw |vectors |</span> |
| <span class="sd"> +-----+---------------+-------------------------+</span> |
| <span class="sd"> |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|</span> |
| <span class="sd"> |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|</span> |
| <span class="sd"> +-----+---------------+-------------------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">minTF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">minDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">maxDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mi">2</span><span class="o">**</span><span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">vocabSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> |
| <span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18,\</span> |
| <span class="sd"> binary=False, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">CountVectorizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.CountVectorizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="CountVectorizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">minTF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">minDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">maxDF</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mi">2</span><span class="o">**</span><span class="mi">63</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">vocabSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> |
| <span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18,\</span> |
| <span class="sd"> binary=False, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets the params for the CountVectorizer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizer.setMinTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMinTF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`minTF`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizer.setMinDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMinDF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMinDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`minDF`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizer.setMaxDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setMaxDF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxDF`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxDF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizer.setVocabSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setVocabSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setVocabSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`vocabSize`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vocabSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizer.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`binary`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizer.html#pyspark.ml.feature.CountVectorizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">CountVectorizerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="CountVectorizerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">CountVectorizerModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_CountVectorizerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"CountVectorizerModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`CountVectorizer`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="CountVectorizerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizerModel.from_vocabulary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.from_vocabulary">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">from_vocabulary</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">,</span> |
| <span class="n">vocabulary</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">minTF</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">binary</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Construct the model directly from a vocabulary list of strings.</span> |
| <span class="sd"> Requires an active SparkContext.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> |
| <span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span> |
| <span class="n">jvocab</span> <span class="o">=</span> <span class="n">CountVectorizerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">vocabulary</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">CountVectorizerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.CountVectorizerModel"</span><span class="p">,</span> <span class="n">jvocab</span> |
| <span class="p">)</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setInputCol</span><span class="p">(</span><span class="n">inputCol</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">outputCol</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setOutputCol</span><span class="p">(</span><span class="n">outputCol</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">minTF</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setMinTF</span><span class="p">(</span><span class="n">minTF</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">binary</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setBinary</span><span class="p">(</span><span class="n">binary</span><span class="p">)</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vocabSize</span><span class="o">=</span><span class="nb">len</span><span class="p">(</span><span class="n">vocabulary</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">model</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">vocabulary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> An array of terms in the vocabulary.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"vocabulary"</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="CountVectorizerModel.setMinTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setMinTF">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMinTF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`minTF`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTF</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CountVectorizerModel.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.CountVectorizerModel.html#pyspark.ml.feature.CountVectorizerModel.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CountVectorizerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`binary`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="DCT"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">DCT</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"DCT"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A feature transformer that takes the 1D discrete cosine transform</span> |
| <span class="sd"> of a real vector. No zero padding is performed on the input vector.</span> |
| <span class="sd"> It returns a real vector of the same length representing the DCT.</span> |
| <span class="sd"> The return vector is scaled such that the transform matrix is</span> |
| <span class="sd"> unitary (aka scaled DCT-II).</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `More information on Wikipedia \</span> |
| <span class="sd"> <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])</span> |
| <span class="sd"> >>> dct = DCT()</span> |
| <span class="sd"> >>> dct.setInverse(False)</span> |
| <span class="sd"> DCT...</span> |
| <span class="sd"> >>> dct.setInputCol("vec")</span> |
| <span class="sd"> DCT...</span> |
| <span class="sd"> >>> dct.setOutputCol("resultVec")</span> |
| <span class="sd"> DCT...</span> |
| <span class="sd"> >>> df2 = dct.transform(df1)</span> |
| <span class="sd"> >>> df2.head().resultVec</span> |
| <span class="sd"> DenseVector([10.969..., -0.707..., -2.041...])</span> |
| <span class="sd"> >>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2)</span> |
| <span class="sd"> >>> df3.head().origVec</span> |
| <span class="sd"> DenseVector([5.0, 8.0, 6.0])</span> |
| <span class="sd"> >>> dctPath = temp_path + "/dct"</span> |
| <span class="sd"> >>> dct.save(dctPath)</span> |
| <span class="sd"> >>> loadedDct = DCT.load(dctPath)</span> |
| <span class="sd"> >>> loadedDct.transform(df1).take(1) == dct.transform(df1).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedDct.getInverse()</span> |
| <span class="sd"> False</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">inverse</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"inverse"</span><span class="p">,</span> |
| <span class="s2">"Set transformer to perform inverse DCT, "</span> <span class="o">+</span> <span class="s2">"default False."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inverse</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inverse=False, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">DCT</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.DCT"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DCT.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inverse</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DCT"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inverse=False, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this DCT.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DCT.setInverse"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setInverse">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInverse</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DCT"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inverse`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DCT.getInverse"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.getInverse">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getInverse</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of inverse or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">inverse</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DCT.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DCT"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DCT.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.DCT.html#pyspark.ml.feature.DCT.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DCT"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="ElementwiseProduct"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">ElementwiseProduct</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"ElementwiseProduct"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Outputs the Hadamard product (i.e., the element-wise product) of each input vector</span> |
| <span class="sd"> with a provided "weight" vector. In other words, it scales each column of the dataset</span> |
| <span class="sd"> by a scalar multiplier.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])</span> |
| <span class="sd"> >>> ep = ElementwiseProduct()</span> |
| <span class="sd"> >>> ep.setScalingVec(Vectors.dense([1.0, 2.0, 3.0]))</span> |
| <span class="sd"> ElementwiseProduct...</span> |
| <span class="sd"> >>> ep.setInputCol("values")</span> |
| <span class="sd"> ElementwiseProduct...</span> |
| <span class="sd"> >>> ep.setOutputCol("eprod")</span> |
| <span class="sd"> ElementwiseProduct...</span> |
| <span class="sd"> >>> ep.transform(df).head().eprod</span> |
| <span class="sd"> DenseVector([2.0, 2.0, 9.0])</span> |
| <span class="sd"> >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod</span> |
| <span class="sd"> DenseVector([4.0, 3.0, 15.0])</span> |
| <span class="sd"> >>> elementwiseProductPath = temp_path + "/elementwise-product"</span> |
| <span class="sd"> >>> ep.save(elementwiseProductPath)</span> |
| <span class="sd"> >>> loadedEp = ElementwiseProduct.load(elementwiseProductPath)</span> |
| <span class="sd"> >>> loadedEp.getScalingVec() == ep.getScalingVec()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedEp.transform(df).take(1) == ep.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">scalingVec</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">Vector</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"scalingVec"</span><span class="p">,</span> |
| <span class="s2">"Vector for hadamard product."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toVector</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">scalingVec</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Vector</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, scalingVec=None, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">ElementwiseProduct</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.ElementwiseProduct"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="ElementwiseProduct.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">scalingVec</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Vector</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"ElementwiseProduct"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, scalingVec=None, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this ElementwiseProduct.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="ElementwiseProduct.setScalingVec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setScalingVec">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ElementwiseProduct"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`scalingVec`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">scalingVec</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="ElementwiseProduct.getScalingVec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.getScalingVec">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of scalingVec or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">scalingVec</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="ElementwiseProduct.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ElementwiseProduct"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="ElementwiseProduct.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ElementwiseProduct.html#pyspark.ml.feature.ElementwiseProduct.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ElementwiseProduct"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="FeatureHasher"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">FeatureHasher</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasNumFeatures</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"FeatureHasher"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Feature hashing projects a set of categorical or numerical features into a feature vector of</span> |
| <span class="sd"> specified dimension (typically substantially smaller than that of the original feature</span> |
| <span class="sd"> space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)</span> |
| <span class="sd"> to map features to indices in the feature vector.</span> |
| |
| <span class="sd"> The FeatureHasher transformer operates on multiple columns. Each column may contain either</span> |
| <span class="sd"> numeric or categorical features. Behavior and handling of column data types is as follows:</span> |
| |
| <span class="sd"> * Numeric columns:</span> |
| <span class="sd"> For numeric features, the hash value of the column name is used to map the</span> |
| <span class="sd"> feature value to its index in the feature vector. By default, numeric features</span> |
| <span class="sd"> are not treated as categorical (even when they are integers). To treat them</span> |
| <span class="sd"> as categorical, specify the relevant columns in `categoricalCols`.</span> |
| |
| <span class="sd"> * String columns:</span> |
| <span class="sd"> For categorical features, the hash value of the string "column_name=value"</span> |
| <span class="sd"> is used to map to the vector index, with an indicator value of `1.0`.</span> |
| <span class="sd"> Thus, categorical features are "one-hot" encoded</span> |
| <span class="sd"> (similarly to using :py:class:`OneHotEncoder` with `dropLast=false`).</span> |
| |
| <span class="sd"> * Boolean columns:</span> |
| <span class="sd"> Boolean values are treated in the same way as string columns. That is,</span> |
| <span class="sd"> boolean features are represented as "column_name=true" or "column_name=false",</span> |
| <span class="sd"> with an indicator value of `1.0`.</span> |
| |
| <span class="sd"> Null (missing) values are ignored (implicitly zero in the resulting feature vector).</span> |
| |
| <span class="sd"> Since a simple modulo is used to map the hash value to a vector index,</span> |
| <span class="sd"> it is advisable to use a power of two as the `numFeatures` parameter;</span> |
| <span class="sd"> otherwise the features will not be mapped evenly to the vector indices.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]</span> |
| <span class="sd"> >>> cols = ["real", "bool", "stringNum", "string"]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, cols)</span> |
| <span class="sd"> >>> hasher = FeatureHasher()</span> |
| <span class="sd"> >>> hasher.setInputCols(cols)</span> |
| <span class="sd"> FeatureHasher...</span> |
| <span class="sd"> >>> hasher.setOutputCol("features")</span> |
| <span class="sd"> FeatureHasher...</span> |
| <span class="sd"> >>> hasher.transform(df).head().features</span> |
| <span class="sd"> SparseVector(262144, {174475: 2.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})</span> |
| <span class="sd"> >>> hasher.setCategoricalCols(["real"]).transform(df).head().features</span> |
| <span class="sd"> SparseVector(262144, {171257: 1.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})</span> |
| <span class="sd"> >>> hasherPath = temp_path + "/hasher"</span> |
| <span class="sd"> >>> hasher.save(hasherPath)</span> |
| <span class="sd"> >>> loadedHasher = FeatureHasher.load(hasherPath)</span> |
| <span class="sd"> >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">categoricalCols</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"categoricalCols"</span><span class="p">,</span> |
| <span class="s2">"numeric columns to treat as categorical"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">categoricalCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, numFeatures=1 &lt;&lt; 18, inputCols=None, outputCol=None, \</span> |
| <span class="sd"> categoricalCols=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">FeatureHasher</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.FeatureHasher"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="FeatureHasher.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">18</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">categoricalCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"FeatureHasher"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, numFeatures=1 &lt;&lt; 18, inputCols=None, outputCol=None, \</span> |
| <span class="sd"> categoricalCols=None)</span> |
| <span class="sd"> Sets params for this FeatureHasher.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="FeatureHasher.setCategoricalCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setCategoricalCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setCategoricalCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"FeatureHasher"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`categoricalCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">categoricalCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="FeatureHasher.getCategoricalCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.getCategoricalCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getCategoricalCols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of categoricalCols or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">categoricalCols</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="FeatureHasher.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setInputCols">[docs]</a> <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"FeatureHasher"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="FeatureHasher.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"FeatureHasher"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="FeatureHasher.setNumFeatures"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.FeatureHasher.html#pyspark.ml.feature.FeatureHasher.setNumFeatures">[docs]</a> <span class="k">def</span> <span class="nf">setNumFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"FeatureHasher"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`numFeatures`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="HashingTF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">HashingTF</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasNumFeatures</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"HashingTF"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Maps a sequence of terms to their term frequencies using the hashing trick.</span> |
| <span class="sd"> Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)</span> |
| <span class="sd"> to calculate the hash code value for the term object.</span> |
| <span class="sd"> Since a simple modulo is used to transform the hash function to a column index,</span> |
| <span class="sd"> it is advisable to use a power of two as the numFeatures parameter;</span> |
| <span class="sd"> otherwise the features will not be mapped evenly to the columns.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["words"])</span> |
| <span class="sd"> >>> hashingTF = HashingTF(inputCol="words", outputCol="features")</span> |
| <span class="sd"> >>> hashingTF.setNumFeatures(10)</span> |
| <span class="sd"> HashingTF...</span> |
| <span class="sd"> >>> hashingTF.transform(df).head().features</span> |
| <span class="sd"> SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})</span> |
| <span class="sd"> >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs</span> |
| <span class="sd"> SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})</span> |
| <span class="sd"> >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"}</span> |
| <span class="sd"> >>> hashingTF.transform(df, params).head().vector</span> |
| <span class="sd"> SparseVector(5, {0: 1.0, 2: 1.0, 3: 1.0})</span> |
| <span class="sd"> >>> hashingTFPath = temp_path + "/hashing-tf"</span> |
| <span class="sd"> >>> hashingTF.save(hashingTFPath)</span> |
| <span class="sd"> >>> loadedHashingTF = HashingTF.load(hashingTFPath)</span> |
| <span class="sd"> >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedHashingTF.transform(df).take(1) == hashingTF.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> hashingTF.indexOf("b")</span> |
| <span class="sd"> 5</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">binary</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"binary"</span><span class="p">,</span> |
| <span class="s2">"If True, all non zero counts are set to 1. "</span> |
| <span class="o">+</span> <span class="s2">"This is useful for discrete probabilistic models that model binary events "</span> |
| <span class="o">+</span> <span class="s2">"rather than integer counts. Default False."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> |
| <span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">HashingTF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.HashingTF"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> <span class="n">binary</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="HashingTF.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> |
| <span class="n">binary</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"HashingTF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this HashingTF.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="HashingTF.setBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"HashingTF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`binary`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">binary</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="HashingTF.getBinary"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.getBinary">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getBinary</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of binary or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">binary</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="HashingTF.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"HashingTF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="HashingTF.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"HashingTF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="HashingTF.setNumFeatures"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.setNumFeatures">[docs]</a> <span class="k">def</span> <span class="nf">setNumFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"HashingTF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`numFeatures`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="HashingTF.indexOf"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF.indexOf">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">indexOf</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">term</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the index of the input term.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_transfer_params_to_java</span><span class="p">()</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">indexOf</span><span class="p">(</span><span class="n">term</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_IDFParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`IDF` and :py:class:`IDFModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">minDocFreq</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"minDocFreq"</span><span class="p">,</span> |
| <span class="s2">"minimum number of documents in which a term should appear for filtering"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of minDocFreq or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDocFreq</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_IDFParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="IDF"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">IDF</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"IDFModel"</span><span class="p">],</span> <span class="n">_IDFParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"IDF"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute the Inverse Document Frequency (IDF) given a collection of documents.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import DenseVector</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(DenseVector([1.0, 2.0]),),</span> |
| <span class="sd"> ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])</span> |
| <span class="sd"> >>> idf = IDF(minDocFreq=3)</span> |
| <span class="sd"> >>> idf.setInputCol("tf")</span> |
| <span class="sd"> IDF...</span> |
| <span class="sd"> >>> idf.setOutputCol("idf")</span> |
| <span class="sd"> IDF...</span> |
| <span class="sd"> >>> model = idf.fit(df)</span> |
| <span class="sd"> >>> model.setOutputCol("idf")</span> |
| <span class="sd"> IDFModel...</span> |
| <span class="sd"> >>> model.getMinDocFreq()</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> >>> model.idf</span> |
| <span class="sd"> DenseVector([0.0, 0.0])</span> |
| <span class="sd"> >>> model.docFreq</span> |
| <span class="sd"> [0, 3]</span> |
| <span class="sd"> >>> model.numDocs == df.count()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> model.transform(df).head().idf</span> |
| <span class="sd"> DenseVector([0.0, 0.0])</span> |
| <span class="sd"> >>> idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs</span> |
| <span class="sd"> DenseVector([0.0, 0.0])</span> |
| <span class="sd"> >>> params = {idf.minDocFreq: 1, idf.outputCol: "vector"}</span> |
| <span class="sd"> >>> idf.fit(df, params).transform(df).head().vector</span> |
| <span class="sd"> DenseVector([0.2877, 0.0])</span> |
| <span class="sd"> >>> idfPath = temp_path + "/idf"</span> |
| <span class="sd"> >>> idf.save(idfPath)</span> |
| <span class="sd"> >>> loadedIdf = IDF.load(idfPath)</span> |
| <span class="sd"> >>> loadedIdf.getMinDocFreq() == idf.getMinDocFreq()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/idf-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = IDFModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.transform(df).head().idf == model.transform(df).head().idf</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">minDocFreq</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">IDF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.IDF"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="IDF.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">minDocFreq</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"IDF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this IDF.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IDF.setMinDocFreq"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setMinDocFreq">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IDF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`minDocFreq`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IDF.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IDF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IDF.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IDF"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IDFModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">IDFModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="IDFModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel">[docs]</a><span class="k">class</span> <span class="nc">IDFModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_IDFParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"IDFModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`IDF`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="IDFModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IDFModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IDFModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IDFModel.html#pyspark.ml.feature.IDFModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IDFModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">idf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the IDF vector.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"idf"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">docFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the document frequency.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"docFreq"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">numDocs</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the number of documents evaluated to compute idf.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"numDocs"</span><span class="p">)</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_ImputerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`Imputer` and :py:class:`ImputerModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">strategy</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"strategy"</span><span class="p">,</span> |
| <span class="s2">"strategy for imputation. If mean, then replace missing values using the mean "</span> |
| <span class="s2">"value of the feature. If median, then replace missing values using the "</span> |
| <span class="s2">"median value of the feature. If mode, then replace missing using the most "</span> |
| <span class="s2">"frequent value of the feature."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">missingValue</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"missingValue"</span><span class="p">,</span> |
| <span class="s2">"The placeholder for the missing values. All occurrences of missingValue "</span> |
| <span class="s2">"will be imputed."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_ImputerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">strategy</span><span class="o">=</span><span class="s2">"mean"</span><span class="p">,</span> <span class="n">missingValue</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">"nan"</span><span class="p">),</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getStrategy</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`strategy` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">strategy</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMissingValue</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`missingValue` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">missingValue</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="Imputer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">Imputer</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"ImputerModel"</span><span class="p">],</span> <span class="n">_ImputerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Imputer"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Imputation estimator for completing missing values, using the mean, median or mode</span> |
| <span class="sd"> of the columns in which the missing values are located. The input columns should be of</span> |
| <span class="sd"> numeric type. Currently Imputer does not support categorical features and</span> |
| <span class="sd"> possibly creates incorrect values for a categorical feature.</span> |
| |
| <span class="sd"> Note that the mean/median/mode value is computed after filtering out missing values.</span> |
| <span class="sd"> All Null values in the input columns are treated as missing, and so are also imputed. For</span> |
| <span class="sd"> computing median, :py:meth:`pyspark.sql.DataFrame.approxQuantile` is used with a</span> |
| <span class="sd"> relative error of `0.001` by default (see :py:attr:`relativeError`).</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")), (float("nan"), 3.0),</span> |
| <span class="sd"> ... (4.0, 4.0), (5.0, 5.0)], ["a", "b"])</span> |
| <span class="sd"> >>> imputer = Imputer()</span> |
| <span class="sd"> >>> imputer.setInputCols(["a", "b"])</span> |
| <span class="sd"> Imputer...</span> |
| <span class="sd"> >>> imputer.setOutputCols(["out_a", "out_b"])</span> |
| <span class="sd"> Imputer...</span> |
| <span class="sd"> >>> imputer.getRelativeError()</span> |
| <span class="sd"> 0.001</span> |
| <span class="sd"> >>> model = imputer.fit(df)</span> |
| <span class="sd"> >>> model.setInputCols(["a", "b"])</span> |
| <span class="sd"> ImputerModel...</span> |
| <span class="sd"> >>> model.getStrategy()</span> |
| <span class="sd"> 'mean'</span> |
| <span class="sd"> >>> model.surrogateDF.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| b|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> |3.0|4.0|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> model.transform(df).show()</span> |
| <span class="sd"> +---+---+-----+-----+</span> |
| <span class="sd"> | a| b|out_a|out_b|</span> |
| <span class="sd"> +---+---+-----+-----+</span> |
| <span class="sd"> |1.0|NaN| 1.0| 4.0|</span> |
| <span class="sd"> |2.0|NaN| 2.0| 4.0|</span> |
| <span class="sd"> |NaN|3.0| 3.0| 3.0|</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> imputer.setStrategy("median").setMissingValue(1.0).fit(df).transform(df).show()</span> |
| <span class="sd"> +---+---+-----+-----+</span> |
| <span class="sd"> | a| b|out_a|out_b|</span> |
| <span class="sd"> +---+---+-----+-----+</span> |
| <span class="sd"> |1.0|NaN| 4.0| NaN|</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1.0,), (2.0,), (float("nan"),), (4.0,), (5.0,)], ["a"])</span> |
| <span class="sd"> >>> imputer1 = Imputer(inputCol="a", outputCol="out_a")</span> |
| <span class="sd"> >>> model1 = imputer1.fit(df1)</span> |
| <span class="sd"> >>> model1.surrogateDF.show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | a|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> |3.0|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> model1.transform(df1).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | a|out_a|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |1.0| 1.0|</span> |
| <span class="sd"> |2.0| 2.0|</span> |
| <span class="sd"> |NaN| 3.0|</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> imputer1.setStrategy("median").setMissingValue(1.0).fit(df1).transform(df1).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | a|out_a|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |1.0| 4.0|</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(float("nan"),), (float("nan"),), (3.0,), (4.0,), (5.0,)],</span> |
| <span class="sd"> ... ["b"])</span> |
| <span class="sd"> >>> imputer2 = Imputer(inputCol="b", outputCol="out_b")</span> |
| <span class="sd"> >>> model2 = imputer2.fit(df2)</span> |
| <span class="sd"> >>> model2.surrogateDF.show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | b|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> |4.0|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> model2.transform(df2).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | b|out_b|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |NaN| 4.0|</span> |
| <span class="sd"> |NaN| 4.0|</span> |
| <span class="sd"> |3.0| 3.0|</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> imputer2.setStrategy("median").setMissingValue(1.0).fit(df2).transform(df2).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | b|out_b|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |NaN| NaN|</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> imputerPath = temp_path + "/imputer"</span> |
| <span class="sd"> >>> imputer.save(imputerPath)</span> |
| <span class="sd"> >>> loadedImputer = Imputer.load(imputerPath)</span> |
| <span class="sd"> >>> loadedImputer.getStrategy() == imputer.getStrategy()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedImputer.getMissingValue()</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> >>> modelPath = temp_path + "/imputer-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = ImputerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.transform(df).head().out_a == model.transform(df).head().out_a</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"mean"</span><span class="p">,</span> |
| <span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="s2">"nan"</span><span class="p">),</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, strategy="mean", missingValue=float("nan"), inputCols=None, \</span> |
| <span class="sd"> outputCols=None, inputCol=None, outputCol=None, relativeError=0.001):</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">Imputer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.Imputer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="Imputer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"mean"</span><span class="p">,</span> |
| <span class="n">missingValue</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="s2">"nan"</span><span class="p">),</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, strategy="mean", missingValue=float("nan"), inputCols=None, \</span> |
| <span class="sd"> outputCols=None, inputCol=None, outputCol=None, relativeError=0.001)</span> |
| <span class="sd"> Sets params for this Imputer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Imputer.setStrategy"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setStrategy">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setStrategy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`strategy`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">strategy</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Imputer.setMissingValue"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setMissingValue">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMissingValue</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`missingValue`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">missingValue</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Imputer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Imputer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Imputer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Imputer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Imputer.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Imputer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`relativeError`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ImputerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">ImputerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ImputerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel">[docs]</a><span class="k">class</span> <span class="nc">ImputerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_ImputerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"ImputerModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`Imputer`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="ImputerModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"ImputerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="ImputerModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"ImputerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="ImputerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ImputerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="ImputerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ImputerModel.html#pyspark.ml.feature.ImputerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ImputerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">surrogateDF</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a DataFrame containing inputCols and their corresponding surrogates,</span> |
| <span class="sd"> which are used to replace the missing values in the input DataFrame.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"surrogateDF"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="Interaction"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">Interaction</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Interaction"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Implements the feature interaction transform. This transformer takes in Double and Vector type</span> |
| <span class="sd"> columns and outputs a flattened vector of their feature interactions. To handle interaction,</span> |
| <span class="sd"> we first one-hot encode any nominal features. Then, a vector of the feature cross-products is</span> |
| <span class="sd"> produced.</span> |
| |
| <span class="sd"> For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be</span> |
| <span class="sd"> `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal</span> |
| <span class="sd"> with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])</span> |
| <span class="sd"> >>> interaction = Interaction()</span> |
| <span class="sd"> >>> interaction.setInputCols(["a", "b"])</span> |
| <span class="sd"> Interaction...</span> |
| <span class="sd"> >>> interaction.setOutputCol("ab")</span> |
| <span class="sd"> Interaction...</span> |
| <span class="sd"> >>> interaction.transform(df).show()</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> | a| b| ab|</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> |0.0|1.0|[0.0]|</span> |
| <span class="sd"> |2.0|3.0|[6.0]|</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> interactionPath = temp_path + "/interaction"</span> |
| <span class="sd"> >>> interaction.save(interactionPath)</span> |
| <span class="sd"> >>> loadedInteraction = Interaction.load(interactionPath)</span> |
| <span class="sd"> >>> loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCols=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">Interaction</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.Interaction"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">()</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Interaction.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Interaction"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCols=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this Interaction.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Interaction.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Interaction"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Interaction.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Interaction.html#pyspark.ml.feature.Interaction.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Interaction"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_MaxAbsScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`MaxAbsScaler` and :py:class:`MaxAbsScalerModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">pass</span> |
| |
| |
| <div class="viewcode-block" id="MaxAbsScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">MaxAbsScaler</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"MaxAbsScalerModel"</span><span class="p">],</span> |
| <span class="n">_MaxAbsScalerParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"MaxAbsScaler"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Rescale each feature individually to the range [-1, 1] by dividing by the largest</span> |
| <span class="sd"> absolute value of that feature. It does not shift/center the data, and thus does not destroy</span> |
| <span class="sd"> any sparsity.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])</span> |
| <span class="sd"> >>> maScaler = MaxAbsScaler(outputCol="scaled")</span> |
| <span class="sd"> >>> maScaler.setInputCol("a")</span> |
| <span class="sd"> MaxAbsScaler...</span> |
| <span class="sd"> >>> model = maScaler.fit(df)</span> |
| <span class="sd"> >>> model.setOutputCol("scaledOutput")</span> |
| <span class="sd"> MaxAbsScalerModel...</span> |
| <span class="sd"> >>> model.transform(df).show()</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> | a|scaledOutput|</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> |[1.0]| [0.5]|</span> |
| <span class="sd"> |[2.0]| [1.0]|</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> scalerPath = temp_path + "/max-abs-scaler"</span> |
| <span class="sd"> >>> maScaler.save(scalerPath)</span> |
| <span class="sd"> >>> loadedMAScaler = MaxAbsScaler.load(scalerPath)</span> |
| <span class="sd"> >>> loadedMAScaler.getInputCol() == maScaler.getInputCol()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedMAScaler.getOutputCol() == maScaler.getOutputCol()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/max-abs-scaler-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = MaxAbsScalerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.maxAbs == model.maxAbs</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">MaxAbsScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.MaxAbsScaler"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">()</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="MaxAbsScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"MaxAbsScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this MaxAbsScaler.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MaxAbsScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MaxAbsScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MaxAbsScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScaler.html#pyspark.ml.feature.MaxAbsScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MaxAbsScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MaxAbsScalerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MaxAbsScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="MaxAbsScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel">[docs]</a><span class="k">class</span> <span class="nc">MaxAbsScalerModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_MaxAbsScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"MaxAbsScalerModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`MaxAbsScaler`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="MaxAbsScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MaxAbsScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MaxAbsScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MaxAbsScalerModel.html#pyspark.ml.feature.MaxAbsScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MaxAbsScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">maxAbs</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Max Abs vector.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"maxAbs"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="MinHashLSH"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">MinHashLSH</span><span class="p">(</span> |
| <span class="n">_LSH</span><span class="p">[</span><span class="s2">"MinHashLSHModel"</span><span class="p">],</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasSeed</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"MinHashLSH"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> LSH class for Jaccard distance.</span> |
| <span class="sd"> The input can be dense or sparse vectors, but it is more efficient if it is sparse.</span> |
| <span class="sd"> For example, `Vectors.sparse(10, [(2, 1.0), (3, 1.0), (5, 1.0)])` means there are 10 elements</span> |
| <span class="sd"> in the space. This set contains elements 2, 3, and 5. Also, any input vector must have at</span> |
| <span class="sd"> least 1 non-zero index, and all non-zero values are treated as binary "1" values.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> See `Wikipedia on MinHash <https://en.wikipedia.org/wiki/MinHash>`_</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> from pyspark.sql.functions import col</span> |
| <span class="sd"> >>> data = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),</span> |
| <span class="sd"> ... (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),</span> |
| <span class="sd"> ... (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["id", "features"])</span> |
| <span class="sd"> >>> mh = MinHashLSH()</span> |
| <span class="sd"> >>> mh.setInputCol("features")</span> |
| <span class="sd"> MinHashLSH...</span> |
| <span class="sd"> >>> mh.setOutputCol("hashes")</span> |
| <span class="sd"> MinHashLSH...</span> |
| <span class="sd"> >>> mh.setSeed(12345)</span> |
| <span class="sd"> MinHashLSH...</span> |
| <span class="sd"> >>> model = mh.fit(df)</span> |
| <span class="sd"> >>> model.setInputCol("features")</span> |
| <span class="sd"> MinHashLSHModel...</span> |
| <span class="sd"> >>> model.transform(df).head()</span> |
| <span class="sd"> Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668...</span> |
| <span class="sd"> >>> data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),</span> |
| <span class="sd"> ... (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),</span> |
| <span class="sd"> ... (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame(data2, ["id", "features"])</span> |
| <span class="sd"> >>> key = Vectors.sparse(6, [1, 2], [1.0, 1.0])</span> |
| <span class="sd"> >>> model.approxNearestNeighbors(df2, key, 1).collect()</span> |
| <span class="sd"> [Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668...</span> |
| <span class="sd"> >>> model.approxSimilarityJoin(df, df2, 0.6, distCol="JaccardDistance").select(</span> |
| <span class="sd"> ... col("datasetA.id").alias("idA"),</span> |
| <span class="sd"> ... col("datasetB.id").alias("idB"),</span> |
| <span class="sd"> ... col("JaccardDistance")).show()</span> |
| <span class="sd"> +---+---+---------------+</span> |
| <span class="sd"> |idA|idB|JaccardDistance|</span> |
| <span class="sd"> +---+---+---------------+</span> |
| <span class="sd"> | 0| 5| 0.5|</span> |
| <span class="sd"> | 1| 4| 0.5|</span> |
| <span class="sd"> +---+---+---------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> mhPath = temp_path + "/mh"</span> |
| <span class="sd"> >>> mh.save(mhPath)</span> |
| <span class="sd"> >>> mh2 = MinHashLSH.load(mhPath)</span> |
| <span class="sd"> >>> mh2.getOutputCol() == mh.getOutputCol()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/mh-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> model2 = MinHashLSHModel.load(modelPath)</span> |
| <span class="sd"> >>> model.transform(df).head().hashes == model2.transform(df).head().hashes</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">MinHashLSH</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.MinHashLSH"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="MinHashLSH.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">numHashTables</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"MinHashLSH"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)</span> |
| <span class="sd"> Sets params for this MinHashLSH.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinHashLSH.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSH.html#pyspark.ml.feature.MinHashLSH.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinHashLSH"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinHashLSHModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MinHashLSHModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="MinHashLSHModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinHashLSHModel.html#pyspark.ml.feature.MinHashLSHModel">[docs]</a><span class="k">class</span> <span class="nc">MinHashLSHModel</span><span class="p">(</span><span class="n">_LSHModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""</span> |
| <span class="sd"> Model produced by :py:class:`MinHashLSH`, where multiple hash functions are stored. Each</span> |
| <span class="sd"> hash function is picked from the following family of hash functions, where :math:`a_i` and</span> |
| <span class="sd"> :math:`b_i` are randomly chosen integers less than prime:</span> |
| <span class="sd"> :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)`. This hash family is approximately min-wise</span> |
| <span class="sd"> independent according to the reference.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> See Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear permutations."</span> |
| <span class="sd"> Electronic Journal of Combinatorics 7 (2000): R26.</span> |
| <span class="sd"> """</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_MinMaxScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`MinMaxScaler` and :py:class:`MinMaxScalerModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="nb">min</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"min"</span><span class="p">,</span> |
| <span class="s2">"Lower bound of the output feature range"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="nb">max</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"max"</span><span class="p">,</span> |
| <span class="s2">"Upper bound of the output feature range"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_MinMaxScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="nb">max</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of min or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">min</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of max or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">max</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="MinMaxScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">MinMaxScaler</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"MinMaxScalerModel"</span><span class="p">],</span> |
| <span class="n">_MinMaxScalerParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"MinMaxScaler"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Rescale each feature individually to a common range [min, max] linearly using column summary</span> |
| <span class="sd"> statistics, which is also known as min-max normalization or Rescaling. The rescaled value for</span> |
| <span class="sd"> feature E is calculated as,</span> |
| |
| <span class="sd"> Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min</span> |
| |
| <span class="sd"> For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Since zero values will probably be transformed to non-zero values, output of the</span> |
| <span class="sd"> transformer will be DenseVector even for sparse input.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])</span> |
| <span class="sd"> >>> mmScaler = MinMaxScaler(outputCol="scaled")</span> |
| <span class="sd"> >>> mmScaler.setInputCol("a")</span> |
| <span class="sd"> MinMaxScaler...</span> |
| <span class="sd"> >>> model = mmScaler.fit(df)</span> |
| <span class="sd"> >>> model.setOutputCol("scaledOutput")</span> |
| <span class="sd"> MinMaxScalerModel...</span> |
| <span class="sd"> >>> model.originalMin</span> |
| <span class="sd"> DenseVector([0.0])</span> |
| <span class="sd"> >>> model.originalMax</span> |
| <span class="sd"> DenseVector([2.0])</span> |
| <span class="sd"> >>> model.transform(df).show()</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> | a|scaledOutput|</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> |[0.0]| [0.0]|</span> |
| <span class="sd"> |[2.0]| [1.0]|</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> minMaxScalerPath = temp_path + "/min-max-scaler"</span> |
| <span class="sd"> >>> mmScaler.save(minMaxScalerPath)</span> |
| <span class="sd"> >>> loadedMMScaler = MinMaxScaler.load(minMaxScalerPath)</span> |
| <span class="sd"> >>> loadedMMScaler.getMin() == mmScaler.getMin()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedMMScaler.getMax() == mmScaler.getMax()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/min-max-scaler-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = MinMaxScalerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.originalMin == model.originalMin</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.originalMax == model.originalMax</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="nb">min</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="nb">max</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">MinMaxScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.MinMaxScaler"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="MinMaxScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="nb">min</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="nb">max</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1.0</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this MinMaxScaler.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinMaxScaler.setMin"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setMin">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`min`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinMaxScaler.setMax"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setMax">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`max`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">max</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinMaxScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinMaxScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScaler.html#pyspark.ml.feature.MinMaxScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScalerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MinMaxScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="MinMaxScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel">[docs]</a><span class="k">class</span> <span class="nc">MinMaxScalerModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_MinMaxScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"MinMaxScalerModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`MinMaxScaler`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="MinMaxScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinMaxScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinMaxScalerModel.setMin"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setMin">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`min`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="MinMaxScalerModel.setMax"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.MinMaxScalerModel.html#pyspark.ml.feature.MinMaxScalerModel.setMax">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"MinMaxScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`max`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="nb">max</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">originalMin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Min value for each original column during fitting.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"originalMin"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">originalMax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Max value for each original column during fitting.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"originalMax"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="NGram"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">NGram</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"NGram"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A feature transformer that converts the input array of strings into an array of n-grams. Null</span> |
| <span class="sd"> values in the input array are ignored.</span> |
| <span class="sd"> It returns an array of n-grams where each n-gram is represented by a space-separated string of</span> |
| <span class="sd"> words.</span> |
| <span class="sd"> When the input is empty, an empty array is returned.</span> |
| <span class="sd"> When the input array length is less than n (number of elements per n-gram), no n-grams are</span> |
| <span class="sd"> returned.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])</span> |
| <span class="sd"> >>> ngram = NGram(n=2)</span> |
| <span class="sd"> >>> ngram.setInputCol("inputTokens")</span> |
| <span class="sd"> NGram...</span> |
| <span class="sd"> >>> ngram.setOutputCol("nGrams")</span> |
| <span class="sd"> NGram...</span> |
| <span class="sd"> >>> ngram.transform(df).head()</span> |
| <span class="sd"> Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b', 'b c', 'c d', 'd e'])</span> |
| <span class="sd"> >>> # Change n-gram length</span> |
| <span class="sd"> >>> ngram.setParams(n=4).transform(df).head()</span> |
| <span class="sd"> Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e'])</span> |
| <span class="sd"> >>> # Temporarily modify output column.</span> |
| <span class="sd"> >>> ngram.transform(df, {ngram.outputCol: "output"}).head()</span> |
| <span class="sd"> Row(inputTokens=['a', 'b', 'c', 'd', 'e'], output=['a b c d', 'b c d e'])</span> |
| <span class="sd"> >>> ngram.transform(df).head()</span> |
| <span class="sd"> Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e'])</span> |
| <span class="sd"> >>> # Must use keyword arguments to specify params.</span> |
| <span class="sd"> >>> ngram.setParams("text")</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> TypeError: Method setParams forces keyword arguments.</span> |
| <span class="sd"> >>> ngramPath = temp_path + "/ngram"</span> |
| <span class="sd"> >>> ngram.save(ngramPath)</span> |
| <span class="sd"> >>> loadedNGram = NGram.load(ngramPath)</span> |
| <span class="sd"> >>> loadedNGram.getN() == ngram.getN()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedNGram.transform(df).take(1) == ngram.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">n</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"n"</span><span class="p">,</span> |
| <span class="s2">"number of elements per n-gram (>=1)"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, n=2, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">NGram</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.NGram"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="NGram.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"NGram"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, n=2, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this NGram.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="NGram.setN"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setN">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setN</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"NGram"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`n`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="NGram.getN"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.getN">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getN</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of n or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="NGram.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"NGram"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="NGram.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.NGram.html#pyspark.ml.feature.NGram.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"NGram"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="Normalizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">Normalizer</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Normalizer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Normalize a vector to have unit norm using the given p-norm.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])</span> |
| <span class="sd"> >>> normalizer = Normalizer(p=2.0)</span> |
| <span class="sd"> >>> normalizer.setInputCol("dense")</span> |
| <span class="sd"> Normalizer...</span> |
| <span class="sd"> >>> normalizer.setOutputCol("features")</span> |
| <span class="sd"> Normalizer...</span> |
| <span class="sd"> >>> normalizer.transform(df).head().features</span> |
| <span class="sd"> DenseVector([0.6, -0.8])</span> |
| <span class="sd"> >>> normalizer.setParams(inputCol="sparse", outputCol="freqs").transform(df).head().freqs</span> |
| <span class="sd"> SparseVector(4, {1: 0.8, 3: 0.6})</span> |
| <span class="sd"> >>> params = {normalizer.p: 1.0, normalizer.inputCol: "dense", normalizer.outputCol: "vector"}</span> |
| <span class="sd"> >>> normalizer.transform(df, params).head().vector</span> |
| <span class="sd"> DenseVector([0.4286, -0.5714])</span> |
| <span class="sd"> >>> normalizerPath = temp_path + "/normalizer"</span> |
| <span class="sd"> >>> normalizer.save(normalizerPath)</span> |
| <span class="sd"> >>> loadedNormalizer = Normalizer.load(normalizerPath)</span> |
| <span class="sd"> >>> loadedNormalizer.getP() == normalizer.getP()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedNormalizer.transform(df).take(1) == normalizer.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">p</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">"p"</span><span class="p">,</span> <span class="s2">"the p norm value."</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">p</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, p=2.0, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">Normalizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.Normalizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Normalizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">p</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Normalizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, p=2.0, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this Normalizer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Normalizer.setP"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setP">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setP</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Normalizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`p`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Normalizer.getP"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.getP">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getP</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of p or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">p</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Normalizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Normalizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Normalizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Normalizer.html#pyspark.ml.feature.Normalizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Normalizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_OneHotEncoderParams</span><span class="p">(</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasOutputCols</span><span class="p">,</span> <span class="n">HasHandleInvalid</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`OneHotEncoder` and :py:class:`OneHotEncoderModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"How to handle invalid data during "</span> |
| <span class="o">+</span> <span class="s2">"transform(). Options are 'keep' (invalid data presented as an extra "</span> |
| <span class="o">+</span> <span class="s2">"categorical feature) or error (throw an error). Note that this Param "</span> |
| <span class="o">+</span> <span class="s2">"is only used during transform; during fitting, invalid data will "</span> |
| <span class="o">+</span> <span class="s2">"result in an error."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">dropLast</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"dropLast"</span><span class="p">,</span> |
| <span class="s2">"whether to drop the last category"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_OneHotEncoderParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span><span class="p">,</span> <span class="n">dropLast</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of dropLast or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dropLast</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="OneHotEncoder"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">OneHotEncoder</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"OneHotEncoderModel"</span><span class="p">],</span> |
| <span class="n">_OneHotEncoderParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"OneHotEncoder"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A one-hot encoder that maps a column of category indices to a column of binary vectors, with</span> |
| <span class="sd"> at most a single one-value per row that indicates the input category index.</span> |
| <span class="sd"> For example with 5 categories, an input value of 2.0 would map to an output vector of</span> |
| <span class="sd"> `[0.0, 0.0, 1.0, 0.0]`.</span> |
| <span class="sd"> The last category is not included by default (configurable via :py:attr:`dropLast`),</span> |
| <span class="sd"> because including it would make the vector entries sum up to one, and hence linearly dependent.</span> |
| <span class="sd"> So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.</span> |
| |
| <span class="sd"> When :py:attr:`handleInvalid` is configured to 'keep', an extra "category" indicating invalid</span> |
| <span class="sd"> values is added as the last category. So when :py:attr:`dropLast` is true, invalid values are</span> |
| <span class="sd"> encoded as an all-zeros vector.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is different from scikit-learn's OneHotEncoder, which keeps all categories.</span> |
| <span class="sd"> The output vectors are sparse.</span> |
| |
| <span class="sd"> When encoding multiple columns by using :py:attr:`inputCols` and</span> |
| <span class="sd"> :py:attr:`outputCols` params, input/output cols come in pairs, specified by the order in</span> |
| <span class="sd"> the arrays, and each pair is treated independently.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> StringIndexer : for converting categorical values into category indices</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"])</span> |
| <span class="sd"> >>> ohe = OneHotEncoder()</span> |
| <span class="sd"> >>> ohe.setInputCols(["input"])</span> |
| <span class="sd"> OneHotEncoder...</span> |
| <span class="sd"> >>> ohe.setOutputCols(["output"])</span> |
| <span class="sd"> OneHotEncoder...</span> |
| <span class="sd"> >>> model = ohe.fit(df)</span> |
| <span class="sd"> >>> model.setOutputCols(["output"])</span> |
| <span class="sd"> OneHotEncoderModel...</span> |
| <span class="sd"> >>> model.getHandleInvalid()</span> |
| <span class="sd"> 'error'</span> |
| <span class="sd"> >>> model.transform(df).head().output</span> |
| <span class="sd"> SparseVector(2, {0: 1.0})</span> |
| <span class="sd"> >>> single_col_ohe = OneHotEncoder(inputCol="input", outputCol="output")</span> |
| <span class="sd"> >>> single_col_model = single_col_ohe.fit(df)</span> |
| <span class="sd"> >>> single_col_model.transform(df).head().output</span> |
| <span class="sd"> SparseVector(2, {0: 1.0})</span> |
| <span class="sd"> >>> ohePath = temp_path + "/ohe"</span> |
| <span class="sd"> >>> ohe.save(ohePath)</span> |
| <span class="sd"> >>> loadedOHE = OneHotEncoder.load(ohePath)</span> |
| <span class="sd"> >>> loadedOHE.getInputCols() == ohe.getInputCols()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/ohe-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = OneHotEncoderModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.categorySizes == model.categorySizes</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True, \</span> |
| <span class="sd"> inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">OneHotEncoder</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.OneHotEncoder"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="OneHotEncoder.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">dropLast</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCols=None, outputCols=None, handleInvalid="error", \</span> |
| <span class="sd"> dropLast=True, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this OneHotEncoder.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoder.setDropLast"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setDropLast">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`dropLast`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoder.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoder.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoder.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoder.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoder.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoder.html#pyspark.ml.feature.OneHotEncoder.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoderModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">OneHotEncoderModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="OneHotEncoderModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel">[docs]</a><span class="k">class</span> <span class="nc">OneHotEncoderModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_OneHotEncoderParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"OneHotEncoderModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`OneHotEncoder`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="OneHotEncoderModel.setDropLast"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setDropLast">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoderModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`dropLast`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoderModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"OneHotEncoderModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoderModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"OneHotEncoderModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoderModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoderModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoderModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoderModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="OneHotEncoderModel.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.OneHotEncoderModel.html#pyspark.ml.feature.OneHotEncoderModel.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"OneHotEncoderModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">categorySizes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Original number of categories for each feature being encoded.</span> |
| <span class="sd"> The array contains one value for each input column, in order.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"categorySizes"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="PolynomialExpansion"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">PolynomialExpansion</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"PolynomialExpansion"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Perform feature expansion in a polynomial space. As said in `wikipedia of Polynomial Expansion</span> |
| <span class="sd"> &lt;http://en.wikipedia.org/wiki/Polynomial_expansion&gt;`_, "In mathematics, an</span> |
| <span class="sd"> expansion of a product of sums expresses it as a sum of products by using the fact that</span> |
| <span class="sd"> multiplication distributes over addition". Take a 2-variable feature vector as an example:</span> |
| <span class="sd"> `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])</span> |
| <span class="sd"> >>> px = PolynomialExpansion(degree=2)</span> |
| <span class="sd"> >>> px.setInputCol("dense")</span> |
| <span class="sd"> PolynomialExpansion...</span> |
| <span class="sd"> >>> px.setOutputCol("expanded")</span> |
| <span class="sd"> PolynomialExpansion...</span> |
| <span class="sd"> >>> px.transform(df).head().expanded</span> |
| <span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span> |
| <span class="sd"> >>> px.setParams(outputCol="test").transform(df).head().test</span> |
| <span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span> |
| <span class="sd"> >>> polyExpansionPath = temp_path + "/poly-expansion"</span> |
| <span class="sd"> >>> px.save(polyExpansionPath)</span> |
| <span class="sd"> >>> loadedPx = PolynomialExpansion.load(polyExpansionPath)</span> |
| <span class="sd"> >>> loadedPx.getDegree() == px.getDegree()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedPx.transform(df).take(1) == px.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">degree</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"degree"</span><span class="p">,</span> |
| <span class="s2">"the polynomial degree to expand (>= 1)"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">degree</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, degree=2, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">PolynomialExpansion</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.PolynomialExpansion"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="PolynomialExpansion.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">degree</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"PolynomialExpansion"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, degree=2, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this PolynomialExpansion.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PolynomialExpansion.setDegree"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setDegree">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PolynomialExpansion"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`degree`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PolynomialExpansion.getDegree"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.getDegree">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of degree or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">degree</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PolynomialExpansion.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PolynomialExpansion"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PolynomialExpansion.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PolynomialExpansion.html#pyspark.ml.feature.PolynomialExpansion.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PolynomialExpansion"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="QuantileDiscretizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">QuantileDiscretizer</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCols</span><span class="p">,</span> |
| <span class="n">HasHandleInvalid</span><span class="p">,</span> |
| <span class="n">HasRelativeError</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"QuantileDiscretizer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> :py:class:`QuantileDiscretizer` takes a column with continuous features and outputs a column</span> |
| <span class="sd"> with binned categorical features. The number of bins can be set using the :py:attr:`numBuckets`</span> |
| <span class="sd"> parameter. It is possible that the number of buckets used will be less than this value, for</span> |
| <span class="sd"> example, if there are too few distinct values of the input to create enough distinct quantiles.</span> |
| <span class="sd"> Since 3.0.0, :py:class:`QuantileDiscretizer` can map multiple columns at once by setting the</span> |
| <span class="sd"> :py:attr:`inputCols` parameter. If both of the :py:attr:`inputCol` and :py:attr:`inputCols`</span> |
| <span class="sd"> parameters are set, an Exception will be thrown. To specify the number of buckets for each</span> |
| <span class="sd"> column, the :py:attr:`numBucketsArray` parameter can be set, or if the number of buckets</span> |
| <span class="sd"> should be the same across columns, :py:attr:`numBuckets` can be set as a convenience.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> NaN handling: Note also that</span> |
| <span class="sd"> :py:class:`QuantileDiscretizer` will raise an error when it finds NaN values in the dataset,</span> |
| <span class="sd"> but the user can also choose to either keep or remove NaN values within the dataset by setting</span> |
| <span class="sd"> :py:attr:`handleInvalid` parameter. If the user chooses to keep NaN values, they will be</span> |
| <span class="sd"> handled specially and placed into their own bucket, for example, if 4 buckets are used, then</span> |
| <span class="sd"> non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4].</span> |
| |
| <span class="sd"> Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for</span> |
| <span class="sd"> :py:meth:`pyspark.sql.DataFrameStatFunctions.approxQuantile` for a detailed description).</span> |
| <span class="sd"> The precision of the approximation can be controlled with the</span> |
| <span class="sd"> :py:attr:`relativeError` parameter.</span> |
| <span class="sd"> The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)]</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame(values, ["values"])</span> |
| <span class="sd"> >>> qds1 = QuantileDiscretizer(inputCol="values", outputCol="buckets")</span> |
| <span class="sd"> >>> qds1.setNumBuckets(2)</span> |
| <span class="sd"> QuantileDiscretizer...</span> |
| <span class="sd"> >>> qds1.setRelativeError(0.01)</span> |
| <span class="sd"> QuantileDiscretizer...</span> |
| <span class="sd"> >>> qds1.setHandleInvalid("error")</span> |
| <span class="sd"> QuantileDiscretizer...</span> |
| <span class="sd"> >>> qds1.getRelativeError()</span> |
| <span class="sd"> 0.01</span> |
| <span class="sd"> >>> bucketizer = qds1.fit(df1)</span> |
| <span class="sd"> >>> qds1.setHandleInvalid("keep").fit(df1).transform(df1).count()</span> |
| <span class="sd"> 6</span> |
| <span class="sd"> >>> qds1.setHandleInvalid("skip").fit(df1).transform(df1).count()</span> |
| <span class="sd"> 4</span> |
| <span class="sd"> >>> splits = bucketizer.getSplits()</span> |
| <span class="sd"> >>> splits[0]</span> |
| <span class="sd"> -inf</span> |
| <span class="sd"> >>> print("%2.1f" % round(splits[1], 1))</span> |
| <span class="sd"> 0.4</span> |
| <span class="sd"> >>> bucketed = bucketizer.transform(df1).head()</span> |
| <span class="sd"> >>> bucketed.buckets</span> |
| <span class="sd"> 0.0</span> |
| <span class="sd"> >>> quantileDiscretizerPath = temp_path + "/quantile-discretizer"</span> |
| <span class="sd"> >>> qds1.save(quantileDiscretizerPath)</span> |
| <span class="sd"> >>> loadedQds = QuantileDiscretizer.load(quantileDiscretizerPath)</span> |
| <span class="sd"> >>> loadedQds.getNumBuckets() == qds1.getNumBuckets()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> inputs = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, 1.5),</span> |
| <span class="sd"> ... (float("nan"), float("nan")), (float("nan"), float("nan"))]</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame(inputs, ["input1", "input2"])</span> |
| <span class="sd"> >>> qds2 = QuantileDiscretizer(relativeError=0.01, handleInvalid="error", numBuckets=2,</span> |
| <span class="sd"> ... inputCols=["input1", "input2"], outputCols=["output1", "output2"])</span> |
| <span class="sd"> >>> qds2.getRelativeError()</span> |
| <span class="sd"> 0.01</span> |
| <span class="sd"> >>> qds2.setHandleInvalid("keep").fit(df2).transform(df2).show()</span> |
| <span class="sd"> +------+------+-------+-------+</span> |
| <span class="sd"> |input1|input2|output1|output2|</span> |
| <span class="sd"> +------+------+-------+-------+</span> |
| <span class="sd"> | 0.1| 0.0| 0.0| 0.0|</span> |
| <span class="sd"> | 0.4| 1.0| 1.0| 1.0|</span> |
| <span class="sd"> | 1.2| 1.3| 1.0| 1.0|</span> |
| <span class="sd"> | 1.5| 1.5| 1.0| 1.0|</span> |
| <span class="sd"> | NaN| NaN| 2.0| 2.0|</span> |
| <span class="sd"> | NaN| NaN| 2.0| 2.0|</span> |
| <span class="sd"> +------+------+-------+-------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> qds3 = QuantileDiscretizer(relativeError=0.01, handleInvalid="error",</span> |
| <span class="sd"> ... numBucketsArray=[5, 10], inputCols=["input1", "input2"],</span> |
| <span class="sd"> ... outputCols=["output1", "output2"])</span> |
| <span class="sd"> >>> qds3.setHandleInvalid("skip").fit(df2).transform(df2).show()</span> |
| <span class="sd"> +------+------+-------+-------+</span> |
| <span class="sd"> |input1|input2|output1|output2|</span> |
| <span class="sd"> +------+------+-------+-------+</span> |
| <span class="sd"> | 0.1| 0.0| 1.0| 1.0|</span> |
| <span class="sd"> | 0.4| 1.0| 2.0| 2.0|</span> |
| <span class="sd"> | 1.2| 1.3| 3.0| 3.0|</span> |
| <span class="sd"> | 1.5| 1.5| 4.0| 4.0|</span> |
| <span class="sd"> +------+------+-------+-------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">numBuckets</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"numBuckets"</span><span class="p">,</span> |
| <span class="s2">"Maximum number of buckets (quantiles, or "</span> |
| <span class="o">+</span> <span class="s2">"categories) into which data points are grouped. Must be >= 2."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"how to handle invalid entries. "</span> |
| <span class="o">+</span> <span class="s2">"Options are skip (filter out rows with invalid values), "</span> |
| <span class="o">+</span> <span class="s2">"error (throw an error), or keep (keep invalid values in a special "</span> |
| <span class="o">+</span> <span class="s2">"additional bucket). Note that in the multiple columns "</span> |
| <span class="o">+</span> <span class="s2">"case, the invalid handling is applied to all columns. That said "</span> |
| <span class="o">+</span> <span class="s2">"for 'error' it will throw an error if any invalids are found in "</span> |
| <span class="o">+</span> <span class="s2">"any columns, for 'skip' it will skip rows with any invalids in "</span> |
| <span class="o">+</span> <span class="s2">"any columns, etc."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"numBucketsArray"</span><span class="p">,</span> |
| <span class="s2">"Array of number of buckets "</span> |
| <span class="o">+</span> <span class="s2">"(quantiles, or categories) into which data points are grouped. "</span> |
| <span class="o">+</span> <span class="s2">"This is for multiple columns input. If transforming multiple "</span> |
| <span class="o">+</span> <span class="s2">"columns and numBucketsArray is not set, but numBuckets is set, "</span> |
| <span class="o">+</span> <span class="s2">"then numBuckets will be applied across all columns."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \</span> |
| <span class="sd"> handleInvalid="error", numBucketsArray=None, inputCols=None, outputCols=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">QuantileDiscretizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.QuantileDiscretizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numBuckets</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">numBucketsArray</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \</span> |
| <span class="sd"> handleInvalid="error", numBucketsArray=None, inputCols=None, outputCols=None)</span> |
| <span class="sd"> Sets the params for the QuantileDiscretizer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setNumBuckets"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setNumBuckets">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setNumBuckets</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`numBuckets`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.getNumBuckets"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.getNumBuckets">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getNumBuckets</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of numBuckets or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numBuckets</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setNumBucketsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setNumBucketsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setNumBucketsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`numBucketsArray`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numBucketsArray</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.getNumBucketsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.getNumBucketsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getNumBucketsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of numBucketsArray or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numBucketsArray</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`relativeError`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="QuantileDiscretizer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.QuantileDiscretizer.html#pyspark.ml.feature.QuantileDiscretizer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"QuantileDiscretizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Bucketizer</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Private method to convert the java_model to a Python model.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">isSet</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">inputCol</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">Bucketizer</span><span class="p">(</span> |
| <span class="n">splits</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">java_model</span><span class="o">.</span><span class="n">getSplits</span><span class="p">()),</span> |
| <span class="n">inputCol</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getInputCol</span><span class="p">(),</span> |
| <span class="n">outputCol</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getOutputCol</span><span class="p">(),</span> |
| <span class="n">handleInvalid</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">(),</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">splitsArrayList</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">java_model</span><span class="o">.</span><span class="n">getSplitsArray</span><span class="p">())]</span> |
| <span class="k">return</span> <span class="n">Bucketizer</span><span class="p">(</span> |
| <span class="n">splitsArray</span><span class="o">=</span><span class="n">splitsArrayList</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getInputCols</span><span class="p">(),</span> |
| <span class="n">outputCols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getOutputCols</span><span class="p">(),</span> |
| <span class="n">handleInvalid</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">getHandleInvalid</span><span class="p">(),</span> |
| <span class="p">)</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_RobustScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasRelativeError</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`RobustScaler` and :py:class:`RobustScalerModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">lower</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"lower"</span><span class="p">,</span> |
| <span class="s2">"Lower quantile to calculate quantile range"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">upper</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"upper"</span><span class="p">,</span> |
| <span class="s2">"Upper quantile to calculate quantile range"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">withCentering</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"withCentering"</span><span class="p">,</span> |
| <span class="s2">"Whether to center data with median"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">withScaling</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"withScaling"</span><span class="p">,</span> |
| <span class="s2">"Whether to scale the data to "</span> <span class="s2">"quantile range"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span> |
| <span class="n">lower</span><span class="o">=</span><span class="mf">0.25</span><span class="p">,</span> <span class="n">upper</span><span class="o">=</span><span class="mf">0.75</span><span class="p">,</span> <span class="n">withCentering</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withScaling</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">relativeError</span><span class="o">=</span><span class="mf">0.001</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getLower</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of lower or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">lower</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getUpper</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of upper or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">upper</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getWithCentering</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of withCentering or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withCentering</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getWithScaling</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of withScaling or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withScaling</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="RobustScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">RobustScaler</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"RobustScaler"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> RobustScaler removes the median and scales the data according to the quantile range.</span> |
| <span class="sd"> The quantile range is by default IQR (Interquartile Range, quantile range between the</span> |
| <span class="sd"> 1st quartile = 25th percentile and the 3rd quartile = 75th percentile) but can be configured.</span> |
| <span class="sd"> Centering and scaling happen independently on each feature by computing the relevant</span> |
| <span class="sd"> statistics on the samples in the training set. Median and quantile range are then</span> |
| <span class="sd"> stored to be used on later data using the transform method.</span> |
| <span class="sd"> Note that NaN values are ignored in the computation of medians and ranges.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> data = [(0, Vectors.dense([0.0, 0.0]),),</span> |
| <span class="sd"> ... (1, Vectors.dense([1.0, -1.0]),),</span> |
| <span class="sd"> ... (2, Vectors.dense([2.0, -2.0]),),</span> |
| <span class="sd"> ... (3, Vectors.dense([3.0, -3.0]),),</span> |
| <span class="sd"> ... (4, Vectors.dense([4.0, -4.0]),),]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["id", "features"])</span> |
| <span class="sd"> >>> scaler = RobustScaler()</span> |
| <span class="sd"> >>> scaler.setInputCol("features")</span> |
| <span class="sd"> RobustScaler...</span> |
| <span class="sd"> >>> scaler.setOutputCol("scaled")</span> |
| <span class="sd"> RobustScaler...</span> |
| <span class="sd"> >>> model = scaler.fit(df)</span> |
| <span class="sd"> >>> model.setOutputCol("output")</span> |
| <span class="sd"> RobustScalerModel...</span> |
| <span class="sd"> >>> model.median</span> |
| <span class="sd"> DenseVector([2.0, -2.0])</span> |
| <span class="sd"> >>> model.range</span> |
| <span class="sd"> DenseVector([2.0, 2.0])</span> |
| <span class="sd"> >>> model.transform(df).collect()[1].output</span> |
| <span class="sd"> DenseVector([0.5, -0.5])</span> |
| <span class="sd"> >>> scalerPath = temp_path + "/robust-scaler"</span> |
| <span class="sd"> >>> scaler.save(scalerPath)</span> |
| <span class="sd"> >>> loadedScaler = RobustScaler.load(scalerPath)</span> |
| <span class="sd"> >>> loadedScaler.getWithCentering() == scaler.getWithCentering()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedScaler.getWithScaling() == scaler.getWithScaling()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/robust-scaler-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = RobustScalerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.median == model.median</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.range == model.range</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">lower</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.25</span><span class="p">,</span> |
| <span class="n">upper</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.75</span><span class="p">,</span> |
| <span class="n">withCentering</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">withScaling</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, lower=0.25, upper=0.75, withCentering=False, withScaling=True, \</span> |
| <span class="sd"> inputCol=None, outputCol=None, relativeError=0.001)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">RobustScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.RobustScaler"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="RobustScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">lower</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.25</span><span class="p">,</span> |
| <span class="n">upper</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.75</span><span class="p">,</span> |
| <span class="n">withCentering</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">withScaling</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.001</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, lower=0.25, upper=0.75, withCentering=False, withScaling=True, \</span> |
| <span class="sd"> inputCol=None, outputCol=None, relativeError=0.001)</span> |
| <span class="sd"> Sets params for this RobustScaler.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScaler.setLower"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setLower">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setLower</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`lower`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">lower</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScaler.setUpper"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setUpper">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setUpper</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`upper`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">upper</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScaler.setWithCentering"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setWithCentering">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWithCentering</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`withCentering`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withCentering</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScaler.setWithScaling"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setWithScaling">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWithScaling</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`withScaling`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withScaling</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScaler.setRelativeError"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScaler.html#pyspark.ml.feature.RobustScaler.setRelativeError">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setRelativeError</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`relativeError`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">relativeError</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScalerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">RobustScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="RobustScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel">[docs]</a><span class="k">class</span> <span class="nc">RobustScalerModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_RobustScalerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"RobustScalerModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`RobustScaler`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="RobustScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RobustScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RobustScalerModel.html#pyspark.ml.feature.RobustScalerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RobustScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Median of the RobustScalerModel.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"median"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">range</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Quantile range of the RobustScalerModel.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"range"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="RegexTokenizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">RegexTokenizer</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"RegexTokenizer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A regex based tokenizer that extracts tokens either by using the</span> |
| <span class="sd"> provided regex pattern (in Java dialect) to split the text</span> |
| <span class="sd"> (default) or repeatedly matching the regex (if gaps is false).</span> |
| <span class="sd"> Optional parameters also allow filtering tokens using a minimal</span> |
| <span class="sd"> length.</span> |
| <span class="sd"> It returns an array of strings that can be empty.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("A B c",)], ["text"])</span> |
| <span class="sd"> >>> reTokenizer = RegexTokenizer()</span> |
| <span class="sd"> >>> reTokenizer.setInputCol("text")</span> |
| <span class="sd"> RegexTokenizer...</span> |
| <span class="sd"> >>> reTokenizer.setOutputCol("words")</span> |
| <span class="sd"> RegexTokenizer...</span> |
| <span class="sd"> >>> reTokenizer.transform(df).head()</span> |
| <span class="sd"> Row(text='A B c', words=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> # Change a parameter.</span> |
| <span class="sd"> >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()</span> |
| <span class="sd"> Row(text='A B c', tokens=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> # Temporarily modify a parameter.</span> |
| <span class="sd"> >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()</span> |
| <span class="sd"> Row(text='A B c', words=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> reTokenizer.transform(df).head()</span> |
| <span class="sd"> Row(text='A B c', tokens=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> # Must use keyword arguments to specify params.</span> |
| <span class="sd"> >>> reTokenizer.setParams("text")</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> TypeError: Method setParams forces keyword arguments.</span> |
| <span class="sd"> >>> regexTokenizerPath = temp_path + "/regex-tokenizer"</span> |
| <span class="sd"> >>> reTokenizer.save(regexTokenizerPath)</span> |
| <span class="sd"> >>> loadedReTokenizer = RegexTokenizer.load(regexTokenizerPath)</span> |
| <span class="sd"> >>> loadedReTokenizer.getMinTokenLength() == reTokenizer.getMinTokenLength()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedReTokenizer.getGaps() == reTokenizer.getGaps()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedReTokenizer.transform(df).take(1) == reTokenizer.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">minTokenLength</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"minTokenLength"</span><span class="p">,</span> |
| <span class="s2">"minimum token length (>= 0)"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">gaps</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"gaps"</span><span class="p">,</span> |
| <span class="s2">"whether regex splits on gaps (True) or matches tokens "</span> <span class="o">+</span> <span class="s2">"(False)"</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">pattern</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"pattern"</span><span class="p">,</span> |
| <span class="s2">"regex pattern (Java dialect) used for tokenizing"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">toLowercase</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"toLowercase"</span><span class="p">,</span> |
| <span class="s2">"whether to convert all characters to "</span> <span class="o">+</span> <span class="s2">"lowercase before tokenizing"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">minTokenLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">gaps</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"</span><span class="se">\\</span><span class="s2">s+"</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">toLowercase</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \</span> |
| <span class="sd"> outputCol=None, toLowercase=True)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">RegexTokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.RegexTokenizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s2">"</span><span class="se">\\</span><span class="s2">s+"</span><span class="p">,</span> <span class="n">toLowercase</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="RegexTokenizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">minTokenLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">gaps</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"</span><span class="se">\\</span><span class="s2">s+"</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">toLowercase</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"RegexTokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \</span> |
| <span class="sd"> outputCol=None, toLowercase=True)</span> |
| <span class="sd"> Sets params for this RegexTokenizer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.setMinTokenLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setMinTokenLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RegexTokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`minTokenLength`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minTokenLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.getMinTokenLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getMinTokenLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of minTokenLength or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minTokenLength</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.setGaps"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setGaps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RegexTokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`gaps`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">gaps</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.getGaps"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getGaps">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of gaps or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">gaps</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.setPattern"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setPattern">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RegexTokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`pattern`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">pattern</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.getPattern"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getPattern">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of pattern or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pattern</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.setToLowercase"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setToLowercase">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setToLowercase</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RegexTokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`toLowercase`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">toLowercase</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.getToLowercase"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.getToLowercase">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getToLowercase</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of toLowercase or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">toLowercase</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RegexTokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RegexTokenizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RegexTokenizer.html#pyspark.ml.feature.RegexTokenizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RegexTokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="SQLTransformer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">SQLTransformer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"SQLTransformer"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Implements the transforms which are defined by SQL statement.</span> |
| <span class="sd"> Currently we only support SQL syntax like `SELECT ... FROM __THIS__`</span> |
| <span class="sd"> where `__THIS__` represents the underlying table of the input dataset.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])</span> |
| <span class="sd"> >>> sqlTrans = SQLTransformer(</span> |
| <span class="sd"> ... statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")</span> |
| <span class="sd"> >>> sqlTrans.transform(df).head()</span> |
| <span class="sd"> Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)</span> |
| <span class="sd"> >>> sqlTransformerPath = temp_path + "/sql-transformer"</span> |
| <span class="sd"> >>> sqlTrans.save(sqlTransformerPath)</span> |
| <span class="sd"> >>> loadedSqlTrans = SQLTransformer.load(sqlTransformerPath)</span> |
| <span class="sd"> >>> loadedSqlTrans.getStatement() == sqlTrans.getStatement()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedSqlTrans.transform(df).take(1) == sqlTrans.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">statement</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">"statement"</span><span class="p">,</span> <span class="s2">"SQL statement"</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, statement=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">SQLTransformer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.SQLTransformer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="SQLTransformer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SQLTransformer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, statement=None)</span> |
| <span class="sd"> Sets params for this SQLTransformer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SQLTransformer.setStatement"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.setStatement">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setStatement</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SQLTransformer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`statement`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">statement</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SQLTransformer.getStatement"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.SQLTransformer.html#pyspark.ml.feature.SQLTransformer.getStatement">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getStatement</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of statement or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">statement</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_StandardScalerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`StandardScaler` and :py:class:`StandardScalerModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">withMean</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">"withMean"</span><span class="p">,</span> <span class="s2">"Center data with mean"</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span> |
| <span class="p">)</span> |
| <span class="n">withStd</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"withStd"</span><span class="p">,</span> |
| <span class="s2">"Scale to unit standard deviation"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_StandardScalerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of withMean or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withMean</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of withStd or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withStd</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="StandardScaler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">StandardScaler</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"StandardScalerModel"</span><span class="p">],</span> |
| <span class="n">_StandardScalerParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"StandardScaler"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Standardizes features by removing the mean and scaling to unit variance using column summary</span> |
| <span class="sd"> statistics on the samples in the training set.</span> |
| |
| <span class="sd"> The "unit std" is computed using the `corrected sample standard deviation \</span> |
| <span class="sd"> &lt;https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation&gt;`_,</span> |
| <span class="sd"> which is computed as the square root of the unbiased sample variance.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])</span> |
| <span class="sd"> >>> standardScaler = StandardScaler()</span> |
| <span class="sd"> >>> standardScaler.setInputCol("a")</span> |
| <span class="sd"> StandardScaler...</span> |
| <span class="sd"> >>> standardScaler.setOutputCol("scaled")</span> |
| <span class="sd"> StandardScaler...</span> |
| <span class="sd"> >>> model = standardScaler.fit(df)</span> |
| <span class="sd"> >>> model.getInputCol()</span> |
| <span class="sd"> 'a'</span> |
| <span class="sd"> >>> model.setOutputCol("output")</span> |
| <span class="sd"> StandardScalerModel...</span> |
| <span class="sd"> >>> model.mean</span> |
| <span class="sd"> DenseVector([1.0])</span> |
| <span class="sd"> >>> model.std</span> |
| <span class="sd"> DenseVector([1.4142])</span> |
| <span class="sd"> >>> model.transform(df).collect()[1].output</span> |
| <span class="sd"> DenseVector([1.4142])</span> |
| <span class="sd"> >>> standardScalerPath = temp_path + "/standard-scaler"</span> |
| <span class="sd"> >>> standardScaler.save(standardScalerPath)</span> |
| <span class="sd"> >>> loadedStandardScaler = StandardScaler.load(standardScalerPath)</span> |
| <span class="sd"> >>> loadedStandardScaler.getWithMean() == standardScaler.getWithMean()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedStandardScaler.getWithStd() == standardScaler.getWithStd()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/standard-scaler-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = StandardScalerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.std == model.std</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.mean == model.mean</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">withMean</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">withStd</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, withMean=False, withStd=True, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">StandardScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.StandardScaler"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="StandardScaler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">withMean</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">withStd</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, withMean=False, withStd=True, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this StandardScaler.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StandardScaler.setWithMean"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setWithMean">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`withMean`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StandardScaler.setWithStd"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setWithStd">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`withStd`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">withStd</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StandardScaler.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StandardScaler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScaler.html#pyspark.ml.feature.StandardScaler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScaler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScalerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">StandardScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="StandardScalerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel">[docs]</a><span class="k">class</span> <span class="nc">StandardScalerModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> |
| <span class="n">_StandardScalerParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"StandardScalerModel"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`StandardScaler`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="StandardScalerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StandardScalerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StandardScalerModel.html#pyspark.ml.feature.StandardScalerModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StandardScalerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Standard deviation of the StandardScalerModel.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"std"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Vector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Mean of the StandardScalerModel.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"mean"</span><span class="p">)</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_StringIndexerParams</span><span class="p">(</span> |
| <span class="n">JavaParams</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCols</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`StringIndexer` and :py:class:`StringIndexerModel`.</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">stringOrderType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"stringOrderType"</span><span class="p">,</span> |
| <span class="s2">"How to order labels of string column. The first label after "</span> |
| <span class="o">+</span> <span class="s2">"ordering is assigned an index of 0. Supported options: "</span> |
| <span class="o">+</span> <span class="s2">"frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. "</span> |
| <span class="o">+</span> <span class="s2">"Default is frequencyDesc. In case of equal frequency when "</span> |
| <span class="o">+</span> <span class="s2">"under frequencyDesc/Asc, the strings are further sorted "</span> |
| <span class="o">+</span> <span class="s2">"alphabetically"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"how to handle invalid data (unseen "</span> |
| <span class="o">+</span> <span class="s2">"or NULL values) in features and label column of string type. "</span> |
| <span class="o">+</span> <span class="s2">"Options are 'skip' (filter out rows with invalid data), "</span> |
| <span class="o">+</span> <span class="s2">"error (throw an error), or 'keep' (put invalid data "</span> |
| <span class="o">+</span> <span class="s2">"in a special additional bucket, at index numLabels)."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_StringIndexerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span><span class="p">,</span> <span class="n">stringOrderType</span><span class="o">=</span><span class="s2">"frequencyDesc"</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getStringOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`stringOrderType` or its default value 'frequencyDesc'.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stringOrderType</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="StringIndexer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">StringIndexer</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"StringIndexerModel"</span><span class="p">],</span> |
| <span class="n">_StringIndexerParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"StringIndexer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A label indexer that maps a string column of labels to an ML column of label indices.</span> |
| <span class="sd"> If the input column is numeric, we cast it to string and index the string values.</span> |
| <span class="sd"> The indices are in [0, numLabels). By default, this is ordered by label frequencies</span> |
| <span class="sd"> so the most frequent label gets index 0. The ordering behavior is controlled by</span> |
| <span class="sd"> setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed",</span> |
| <span class="sd"> ... stringOrderType="frequencyDesc")</span> |
| <span class="sd"> >>> stringIndexer.setHandleInvalid("error")</span> |
| <span class="sd"> StringIndexer...</span> |
| <span class="sd"> >>> model = stringIndexer.fit(stringIndDf)</span> |
| <span class="sd"> >>> model.setHandleInvalid("error")</span> |
| <span class="sd"> StringIndexerModel...</span> |
| <span class="sd"> >>> td = model.transform(stringIndDf)</span> |
| <span class="sd"> >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),</span> |
| <span class="sd"> ... key=lambda x: x[0])</span> |
| <span class="sd"> [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]</span> |
| <span class="sd"> >>> inverter = IndexToString(inputCol="indexed", outputCol="label2", labels=model.labels)</span> |
| <span class="sd"> >>> itd = inverter.transform(td)</span> |
| <span class="sd"> >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]),</span> |
| <span class="sd"> ... key=lambda x: x[0])</span> |
| <span class="sd"> [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')]</span> |
| <span class="sd"> >>> stringIndexerPath = temp_path + "/string-indexer"</span> |
| <span class="sd"> >>> stringIndexer.save(stringIndexerPath)</span> |
| <span class="sd"> >>> loadedIndexer = StringIndexer.load(stringIndexerPath)</span> |
| <span class="sd"> >>> loadedIndexer.getHandleInvalid() == stringIndexer.getHandleInvalid()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/string-indexer-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = StringIndexerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.labels == model.labels</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> indexToStringPath = temp_path + "/index-to-string"</span> |
| <span class="sd"> >>> inverter.save(indexToStringPath)</span> |
| <span class="sd"> >>> loadedInverter = IndexToString.load(indexToStringPath)</span> |
| <span class="sd"> >>> loadedInverter.getLabels() == inverter.getLabels()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(stringIndDf).take(1) == model.transform(stringIndDf).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> stringIndexer.getStringOrderType()</span> |
| <span class="sd"> 'frequencyDesc'</span> |
| <span class="sd"> >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error",</span> |
| <span class="sd"> ... stringOrderType="alphabetDesc")</span> |
| <span class="sd"> >>> model = stringIndexer.fit(stringIndDf)</span> |
| <span class="sd"> >>> td = model.transform(stringIndDf)</span> |
| <span class="sd"> >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),</span> |
| <span class="sd"> ... key=lambda x: x[0])</span> |
| <span class="sd"> [(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)]</span> |
| <span class="sd"> >>> fromlabelsModel = StringIndexerModel.from_labels(["a", "b", "c"],</span> |
| <span class="sd"> ... inputCol="label", outputCol="indexed", handleInvalid="error")</span> |
| <span class="sd"> >>> result = fromlabelsModel.transform(stringIndDf)</span> |
| <span class="sd"> >>> sorted(set([(i[0], i[1]) for i in result.select(result.id, result.indexed).collect()]),</span> |
| <span class="sd"> ... key=lambda x: x[0])</span> |
| <span class="sd"> [(0, 0.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 2.0)]</span> |
| <span class="sd"> >>> testData = sc.parallelize([Row(id=0, label1="a", label2="e"),</span> |
| <span class="sd"> ... Row(id=1, label1="b", label2="f"),</span> |
| <span class="sd"> ... Row(id=2, label1="c", label2="e"),</span> |
| <span class="sd"> ... Row(id=3, label1="a", label2="f"),</span> |
| <span class="sd"> ... Row(id=4, label1="a", label2="f"),</span> |
| <span class="sd"> ... Row(id=5, label1="c", label2="f")], 3)</span> |
| <span class="sd"> >>> multiRowDf = spark.createDataFrame(testData)</span> |
| <span class="sd"> >>> inputs = ["label1", "label2"]</span> |
| <span class="sd"> >>> outputs = ["index1", "index2"]</span> |
| <span class="sd"> >>> stringIndexer = StringIndexer(inputCols=inputs, outputCols=outputs)</span> |
| <span class="sd"> >>> model = stringIndexer.fit(multiRowDf)</span> |
| <span class="sd"> >>> result = model.transform(multiRowDf)</span> |
| <span class="sd"> >>> sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,</span> |
| <span class="sd"> ... result.index2).collect()]), key=lambda x: x[0])</span> |
| <span class="sd"> [(0, 0.0, 1.0), (1, 2.0, 0.0), (2, 1.0, 1.0), (3, 0.0, 0.0), (4, 0.0, 0.0), (5, 1.0, 0.0)]</span> |
| <span class="sd"> >>> fromlabelsModel = StringIndexerModel.from_arrays_of_labels([["a", "b", "c"], ["e", "f"]],</span> |
| <span class="sd"> ... inputCols=inputs, outputCols=outputs)</span> |
| <span class="sd"> >>> result = fromlabelsModel.transform(multiRowDf)</span> |
| <span class="sd"> >>> sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,</span> |
| <span class="sd"> ... result.index2).collect()]), key=lambda x: x[0])</span> |
| <span class="sd"> [(0, 0.0, 0.0), (1, 1.0, 1.0), (2, 2.0, 0.0), (3, 0.0, 1.0), (4, 0.0, 1.0), (5, 2.0, 1.0)]</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"frequencyDesc"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \</span> |
| <span class="sd"> handleInvalid="error", stringOrderType="frequencyDesc")</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">StringIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.StringIndexer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="StringIndexer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="n">stringOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"frequencyDesc"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \</span> |
| <span class="sd"> handleInvalid="error", stringOrderType="frequencyDesc")</span> |
| <span class="sd"> Sets params for this StringIndexer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">StringIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="StringIndexer.setStringOrderType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setStringOrderType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setStringOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`stringOrderType`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stringOrderType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexer.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexer.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexer.html#pyspark.ml.feature.StringIndexer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="StringIndexerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel">[docs]</a><span class="k">class</span> <span class="nc">StringIndexerModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_StringIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"StringIndexerModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`StringIndexer`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="StringIndexerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexerModel.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexerModel.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexerModel.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.setHandleInvalid">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StringIndexerModel.from_labels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.from_labels">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">from_labels</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">,</span> |
| <span class="n">labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Construct the model directly from an array of label strings,</span> |
| <span class="sd"> requires an active SparkContext.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> |
| <span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span> |
| <span class="n">jlabels</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">labels</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.StringIndexerModel"</span><span class="p">,</span> <span class="n">jlabels</span> |
| <span class="p">)</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setInputCol</span><span class="p">(</span><span class="n">inputCol</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">outputCol</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setOutputCol</span><span class="p">(</span><span class="n">outputCol</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">handleInvalid</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="n">handleInvalid</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">model</span></div> |
| |
| <div class="viewcode-block" id="StringIndexerModel.from_arrays_of_labels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StringIndexerModel.html#pyspark.ml.feature.StringIndexerModel.from_arrays_of_labels">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">from_arrays_of_labels</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">,</span> |
| <span class="n">arrayOfLabels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StringIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Construct the model directly from an array of array of label strings,</span> |
| <span class="sd"> requires an active SparkContext.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> |
| <span class="k">assert</span> <span class="n">sc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">java_class</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_gateway</span><span class="o">.</span><span class="n">jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">String</span> |
| <span class="n">jlabels</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_new_java_array</span><span class="p">(</span><span class="n">arrayOfLabels</span><span class="p">,</span> <span class="n">java_class</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">StringIndexerModel</span><span class="o">.</span><span class="n">_create_from_java_class</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.StringIndexerModel"</span><span class="p">,</span> <span class="n">jlabels</span> |
| <span class="p">)</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setInputCols</span><span class="p">(</span><span class="n">inputCols</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">outputCols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setOutputCols</span><span class="p">(</span><span class="n">outputCols</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">handleInvalid</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">setHandleInvalid</span><span class="p">(</span><span class="n">handleInvalid</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">model</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">labels</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Ordered list of labels, corresponding to indices to be assigned.</span> |
| |
| <span class="sd"> .. deprecated:: 3.1.0</span> |
| <span class="sd"> It will be removed in future versions. Use `labelsArray` method instead.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"labels"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.2"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">labelsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Array of ordered list of labels, corresponding to indices to be assigned</span> |
| <span class="sd"> for each input column.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"labelsArray"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="IndexToString"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">IndexToString</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"IndexToString"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A :py:class:`pyspark.ml.base.Transformer` that maps a column of indices back to a new column of</span> |
| <span class="sd"> corresponding string values.</span> |
| <span class="sd"> The index-string mapping is either from the ML attributes of the input column,</span> |
| <span class="sd"> or from user-supplied labels (which take precedence over ML attributes).</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> StringIndexer : for converting categorical values into category indices</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">labels</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"labels"</span><span class="p">,</span> |
| <span class="s2">"Optional array of labels specifying index-string mapping."</span> |
| <span class="o">+</span> <span class="s2">" If not provided or if empty, then metadata from inputCol is used instead."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, labels=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">IndexToString</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.IndexToString"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="IndexToString.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"IndexToString"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, labels=None)</span> |
| <span class="sd"> Sets params for this IndexToString.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexToString.setLabels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setLabels">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setLabels</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"IndexToString"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`labels`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labels</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexToString.getLabels"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.getLabels">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getLabels</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`labels` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">labels</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexToString.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IndexToString"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexToString.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.IndexToString.html#pyspark.ml.feature.IndexToString.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"IndexToString"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="StopWordsRemover"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover">[docs]</a><span class="k">class</span> <span class="nc">StopWordsRemover</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCols</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"StopWordsRemover"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A feature transformer that filters out stop words from input.</span> |
| <span class="sd"> Since 3.0.0, :py:class:`StopWordsRemover` can filter stop words out of multiple columns at once by setting</span> |
| <span class="sd"> the :py:attr:`inputCols` parameter. Note that when both the :py:attr:`inputCol` and</span> |
| <span class="sd"> :py:attr:`inputCols` parameters are set, an Exception will be thrown.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Null values in the input array are preserved unless null is explicitly added to stopWords.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"])</span> |
| <span class="sd"> >>> remover = StopWordsRemover(stopWords=["b"])</span> |
| <span class="sd"> >>> remover.setInputCol("text")</span> |
| <span class="sd"> StopWordsRemover...</span> |
| <span class="sd"> >>> remover.setOutputCol("words")</span> |
| <span class="sd"> StopWordsRemover...</span> |
| <span class="sd"> >>> remover.transform(df).head().words == ['a', 'c']</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> stopWordsRemoverPath = temp_path + "/stopwords-remover"</span> |
| <span class="sd"> >>> remover.save(stopWordsRemoverPath)</span> |
| <span class="sd"> >>> loadedRemover = StopWordsRemover.load(stopWordsRemoverPath)</span> |
| <span class="sd"> >>> loadedRemover.getStopWords() == remover.getStopWords()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedRemover.getCaseSensitive() == remover.getCaseSensitive()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedRemover.transform(df).take(1) == remover.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(["a", "b", "c"], ["a", "b"])], ["text1", "text2"])</span> |
| <span class="sd"> >>> remover2 = StopWordsRemover(stopWords=["b"])</span> |
| <span class="sd"> >>> remover2.setInputCols(["text1", "text2"]).setOutputCols(["words1", "words2"])</span> |
| <span class="sd"> StopWordsRemover...</span> |
| <span class="sd"> >>> remover2.transform(df2).show()</span> |
| <span class="sd"> +---------+------+------+------+</span> |
| <span class="sd"> | text1| text2|words1|words2|</span> |
| <span class="sd"> +---------+------+------+------+</span> |
| <span class="sd"> |[a, b, c]|[a, b]|[a, c]| [a]|</span> |
| <span class="sd"> +---------+------+------+------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">stopWords</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"stopWords"</span><span class="p">,</span> |
| <span class="s2">"The words to be filtered out"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">caseSensitive</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"caseSensitive"</span><span class="p">,</span> |
| <span class="s2">"whether to do a case sensitive "</span> <span class="o">+</span> <span class="s2">"comparison over the stop words"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">locale</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"locale"</span><span class="p">,</span> |
| <span class="s2">"locale of the input. ignored when case sensitive "</span> <span class="o">+</span> <span class="s2">"is true"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, \</span> |
| <span class="sd"> locale=None, inputCols=None, outputCols=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">StopWordsRemover</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.StopWordsRemover"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span> |
| <span class="n">stopWords</span><span class="o">=</span><span class="n">StopWordsRemover</span><span class="o">.</span><span class="n">loadDefaultStopWords</span><span class="p">(</span><span class="s2">"english"</span><span class="p">),</span> |
| <span class="n">caseSensitive</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="n">locale</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">getLocale</span><span class="p">(),</span> |
| <span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">stopWords</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">caseSensitive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">locale</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, \</span> |
| <span class="sd"> locale=None, inputCols=None, outputCols=None)</span> |
| <span class="sd"> Sets params for this StopWordsRemover.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setStopWords">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setStopWords</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`stopWords`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stopWords</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.getStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getStopWords">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getStopWords</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`stopWords` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stopWords</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setCaseSensitive"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setCaseSensitive">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setCaseSensitive</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`caseSensitive`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">caseSensitive</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.getCaseSensitive"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getCaseSensitive">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getCaseSensitive</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`caseSensitive` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">caseSensitive</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setLocale"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setLocale">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setLocale</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`locale`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">locale</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.getLocale"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.getLocale">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getLocale</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`locale` or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">locale</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setInputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.setOutputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.setOutputCols">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"StopWordsRemover"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="StopWordsRemover.loadDefaultStopWords"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.StopWordsRemover.html#pyspark.ml.feature.StopWordsRemover.loadDefaultStopWords">[docs]</a> <span class="nd">@staticmethod</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">loadDefaultStopWords</span><span class="p">(</span><span class="n">language</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Loads the default stop words for the given language.</span> |
| <span class="sd"> Supported languages: danish, dutch, english, finnish, french, german, hungarian,</span> |
| <span class="sd"> italian, norwegian, portuguese, russian, spanish, swedish, turkish</span> |
| <span class="sd"> """</span> |
| <span class="n">stopWordsObj</span> <span class="o">=</span> <span class="n">_jvm</span><span class="p">()</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">feature</span><span class="o">.</span><span class="n">StopWordsRemover</span> |
| <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="n">stopWordsObj</span><span class="o">.</span><span class="n">loadDefaultStopWords</span><span class="p">(</span><span class="n">language</span><span class="p">))</span></div></div> |
| |
| |
| <div class="viewcode-block" id="Tokenizer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">Tokenizer</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Tokenizer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A tokenizer that converts the input string to lowercase and then</span> |
| <span class="sd"> splits it by white spaces.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a b c",)], ["text"])</span> |
| <span class="sd"> >>> tokenizer = Tokenizer(outputCol="words")</span> |
| <span class="sd"> >>> tokenizer.setInputCol("text")</span> |
| <span class="sd"> Tokenizer...</span> |
| <span class="sd"> >>> tokenizer.transform(df).head()</span> |
| <span class="sd"> Row(text='a b c', words=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> # Change a parameter.</span> |
| <span class="sd"> >>> tokenizer.setParams(outputCol="tokens").transform(df).head()</span> |
| <span class="sd"> Row(text='a b c', tokens=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> # Temporarily modify a parameter.</span> |
| <span class="sd"> >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head()</span> |
| <span class="sd"> Row(text='a b c', words=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> tokenizer.transform(df).head()</span> |
| <span class="sd"> Row(text='a b c', tokens=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> # Must use keyword arguments to specify params.</span> |
| <span class="sd"> >>> tokenizer.setParams("text")</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> TypeError: Method setParams forces keyword arguments.</span> |
| <span class="sd"> >>> tokenizerPath = temp_path + "/tokenizer"</span> |
| <span class="sd"> >>> tokenizer.save(tokenizerPath)</span> |
| <span class="sd"> >>> loadedTokenizer = Tokenizer.load(tokenizerPath)</span> |
| <span class="sd"> >>> loadedTokenizer.transform(df).head().tokens == tokenizer.transform(df).head().tokens</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">Tokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.Tokenizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Tokenizer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Tokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Sets params for this Tokenizer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Tokenizer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Tokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Tokenizer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Tokenizer.html#pyspark.ml.feature.Tokenizer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Tokenizer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="VectorAssembler"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">VectorAssembler</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCols</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">HasHandleInvalid</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"VectorAssembler"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A feature transformer that merges multiple columns into a vector column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 0, 3)], ["a", "b", "c"])</span> |
| <span class="sd"> >>> vecAssembler = VectorAssembler(outputCol="features")</span> |
| <span class="sd"> >>> vecAssembler.setInputCols(["a", "b", "c"])</span> |
| <span class="sd"> VectorAssembler...</span> |
| <span class="sd"> >>> vecAssembler.transform(df).head().features</span> |
| <span class="sd"> DenseVector([1.0, 0.0, 3.0])</span> |
| <span class="sd"> >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs</span> |
| <span class="sd"> DenseVector([1.0, 0.0, 3.0])</span> |
| <span class="sd"> >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"}</span> |
| <span class="sd"> >>> vecAssembler.transform(df, params).head().vector</span> |
| <span class="sd"> DenseVector([0.0, 1.0])</span> |
| <span class="sd"> >>> vectorAssemblerPath = temp_path + "/vector-assembler"</span> |
| <span class="sd"> >>> vecAssembler.save(vectorAssemblerPath)</span> |
| <span class="sd"> >>> loadedAssembler = VectorAssembler.load(vectorAssemblerPath)</span> |
| <span class="sd"> >>> loadedAssembler.transform(df).head().freqs == vecAssembler.transform(df).head().freqs</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> dfWithNullsAndNaNs = spark.createDataFrame(</span> |
| <span class="sd"> ... [(1.0, 2.0, None), (3.0, float("nan"), 4.0), (5.0, 6.0, 7.0)], ["a", "b", "c"])</span> |
| <span class="sd"> >>> vecAssembler2 = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features",</span> |
| <span class="sd"> ... handleInvalid="keep")</span> |
| <span class="sd"> >>> vecAssembler2.transform(dfWithNullsAndNaNs).show()</span> |
| <span class="sd"> +---+---+----+-------------+</span> |
| <span class="sd"> | a| b| c| features|</span> |
| <span class="sd"> +---+---+----+-------------+</span> |
| <span class="sd"> |1.0|2.0|NULL|[1.0,2.0,NaN]|</span> |
| <span class="sd"> |3.0|NaN| 4.0|[3.0,NaN,4.0]|</span> |
| <span class="sd"> |5.0|6.0| 7.0|[5.0,6.0,7.0]|</span> |
| <span class="sd"> +---+---+----+-------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> vecAssembler2.setParams(handleInvalid="skip").transform(dfWithNullsAndNaNs).show()</span> |
| <span class="sd"> +---+---+---+-------------+</span> |
| <span class="sd"> | a| b| c| features|</span> |
| <span class="sd"> +---+---+---+-------------+</span> |
| <span class="sd"> |5.0|6.0|7.0|[5.0,6.0,7.0]|</span> |
| <span class="sd"> +---+---+---+-------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"How to handle invalid data (NULL "</span> |
| <span class="o">+</span> <span class="s2">"and NaN values). Options are 'skip' (filter out rows with invalid "</span> |
| <span class="o">+</span> <span class="s2">"data), 'error' (throw an error), or 'keep' (return relevant number "</span> |
| <span class="o">+</span> <span class="s2">"of NaN in the output). Column lengths are taken from the size of ML "</span> |
| <span class="o">+</span> <span class="s2">"Attribute Group, which can be set using `VectorSizeHint` in a "</span> |
| <span class="o">+</span> <span class="s2">"pipeline before `VectorAssembler`. Column lengths can also be "</span> |
| <span class="o">+</span> <span class="s2">"inferred from first rows of the data since it is safe to do so but "</span> |
| <span class="o">+</span> <span class="s2">"only in case of 'error' or 'skip')."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCols=None, outputCol=None, handleInvalid="error")</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">VectorAssembler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.VectorAssembler"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="VectorAssembler.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorAssembler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCols=None, outputCol=None, handleInvalid="error")</span> |
| <span class="sd"> Sets params for this VectorAssembler.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorAssembler.setInputCols"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setInputCols">[docs]</a> <span class="k">def</span> <span class="nf">setInputCols</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"VectorAssembler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCols`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCols</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorAssembler.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorAssembler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorAssembler.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorAssembler.html#pyspark.ml.feature.VectorAssembler.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorAssembler"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_VectorIndexerParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`VectorIndexer` and :py:class:`VectorIndexerModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">maxCategories</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"maxCategories"</span><span class="p">,</span> |
| <span class="s2">"Threshold for the number of values a categorical feature can take "</span> |
| <span class="o">+</span> <span class="s2">"(>= 2). If a feature is found to have > maxCategories values, then "</span> |
| <span class="o">+</span> <span class="s2">"it is declared continuous."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"How to handle invalid data "</span> |
| <span class="o">+</span> <span class="s2">"(unseen labels or NULL values). Options are 'skip' (filter out "</span> |
| <span class="o">+</span> <span class="s2">"rows with invalid data), 'error' (throw an error), or 'keep' (put "</span> |
| <span class="o">+</span> <span class="s2">"invalid data in a special additional bucket, at index of the number "</span> |
| <span class="o">+</span> <span class="s2">"of categories of the feature)."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_VectorIndexerParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of maxCategories or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxCategories</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="VectorIndexer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">VectorIndexer</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"VectorIndexerModel"</span><span class="p">],</span> |
| <span class="n">_VectorIndexerParams</span><span class="p">,</span> |
| <span class="n">HasHandleInvalid</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"VectorIndexer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Class for indexing categorical feature columns in a dataset of `Vector`.</span> |
| |
| <span class="sd"> This has 2 usage modes:</span> |
| <span class="sd"> - Automatically identify categorical features (default behavior)</span> |
| <span class="sd"> - This helps process a dataset of unknown vectors into a dataset with some continuous</span> |
| <span class="sd"> features and some categorical features. The choice between continuous and categorical</span> |
| <span class="sd"> is based upon a maxCategories parameter.</span> |
| <span class="sd"> - Set maxCategories to the maximum number of categories any categorical feature should</span> |
| <span class="sd"> have.</span> |
| <span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span> |
| <span class="sd"> If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},</span> |
| <span class="sd"> and feature 1 will be declared continuous.</span> |
| <span class="sd"> - Index all features, if all features are categorical</span> |
| <span class="sd"> - If maxCategories is set to be very large, then this will build an index of unique</span> |
| <span class="sd"> values for all features.</span> |
| <span class="sd"> - Warning: This can cause problems if features are continuous since this will collect ALL</span> |
| <span class="sd"> unique values to the driver.</span> |
| <span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span> |
| <span class="sd"> If maxCategories >= 3, then both features will be declared categorical.</span> |
| |
| <span class="sd"> This returns a model which can transform categorical features to use 0-based indices.</span> |
| |
| <span class="sd"> Index stability:</span> |
| <span class="sd"> - This is not guaranteed to choose the same category index across multiple runs.</span> |
| <span class="sd"> - If a categorical feature includes value 0, then this is guaranteed to map value 0 to</span> |
| <span class="sd"> index 0. This maintains vector sparsity.</span> |
| <span class="sd"> - More stability may be added in the future.</span> |
| |
| <span class="sd"> TODO: Future extensions: The following functionality is planned for the future:</span> |
| <span class="sd"> - Preserve metadata in transform; if a feature's metadata is already present,</span> |
| <span class="sd"> do not recompute.</span> |
| <span class="sd"> - Specify certain features to not index, either via a parameter or via existing metadata.</span> |
| <span class="sd"> - Add warning if a categorical feature has only 1 category.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])</span> |
| <span class="sd"> >>> indexer = VectorIndexer(maxCategories=2, inputCol="a")</span> |
| <span class="sd"> >>> indexer.setOutputCol("indexed")</span> |
| <span class="sd"> VectorIndexer...</span> |
| <span class="sd"> >>> model = indexer.fit(df)</span> |
| <span class="sd"> >>> indexer.getHandleInvalid()</span> |
| <span class="sd"> 'error'</span> |
| <span class="sd"> >>> model.setOutputCol("output")</span> |
| <span class="sd"> VectorIndexerModel...</span> |
| <span class="sd"> >>> model.transform(df).head().output</span> |
| <span class="sd"> DenseVector([1.0, 0.0])</span> |
| <span class="sd"> >>> model.numFeatures</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model.categoryMaps</span> |
| <span class="sd"> {0: {0.0: 0, -1.0: 1}}</span> |
| <span class="sd"> >>> indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test</span> |
| <span class="sd"> DenseVector([0.0, 1.0])</span> |
| <span class="sd"> >>> params = {indexer.maxCategories: 3, indexer.outputCol: "vector"}</span> |
| <span class="sd"> >>> model2 = indexer.fit(df, params)</span> |
| <span class="sd"> >>> model2.transform(df).head().vector</span> |
| <span class="sd"> DenseVector([1.0, 0.0])</span> |
| <span class="sd"> >>> vectorIndexerPath = temp_path + "/vector-indexer"</span> |
| <span class="sd"> >>> indexer.save(vectorIndexerPath)</span> |
| <span class="sd"> >>> loadedIndexer = VectorIndexer.load(vectorIndexerPath)</span> |
| <span class="sd"> >>> loadedIndexer.getMaxCategories() == indexer.getMaxCategories()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/vector-indexer-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = VectorIndexerModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.numFeatures == model.numFeatures</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.categoryMaps == model.categoryMaps</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> dfWithInvalid = spark.createDataFrame([(Vectors.dense([3.0, 1.0]),)], ["a"])</span> |
| <span class="sd"> >>> indexer.getHandleInvalid()</span> |
| <span class="sd"> 'error'</span> |
| <span class="sd"> >>> model3 = indexer.setHandleInvalid("skip").fit(df)</span> |
| <span class="sd"> >>> model3.transform(dfWithInvalid).count()</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> >>> model4 = indexer.setParams(handleInvalid="keep", outputCol="indexed").fit(df)</span> |
| <span class="sd"> >>> model4.transform(dfWithInvalid).head().indexed</span> |
| <span class="sd"> DenseVector([2.0, 1.0])</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">maxCategories</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, maxCategories=20, inputCol=None, outputCol=None, handleInvalid="error")</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">VectorIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.VectorIndexer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="VectorIndexer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">maxCategories</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, maxCategories=20, inputCol=None, outputCol=None, handleInvalid="error")</span> |
| <span class="sd"> Sets params for this VectorIndexer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorIndexer.setMaxCategories"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setMaxCategories">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxCategories`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorIndexer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorIndexer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorIndexer.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexer.html#pyspark.ml.feature.VectorIndexer.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexerModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">VectorIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="VectorIndexerModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel">[docs]</a><span class="k">class</span> <span class="nc">VectorIndexerModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> <span class="n">_VectorIndexerParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"VectorIndexerModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`VectorIndexer`.</span> |
| |
| <span class="sd"> Transform categorical features to use 0-based indices instead of their original values.</span> |
| <span class="sd"> - Categorical features are mapped to indices.</span> |
| <span class="sd"> - Continuous features (columns) are left unchanged.</span> |
| |
| <span class="sd"> This also appends metadata to the output column, marking features as Numeric (continuous),</span> |
| <span class="sd"> Nominal (categorical), or Binary (either continuous or categorical).</span> |
| <span class="sd"> Non-ML metadata is not carried over from the input to the output column.</span> |
| |
| <span class="sd"> This maintains vector sparsity.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="VectorIndexerModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorIndexerModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorIndexerModel.html#pyspark.ml.feature.VectorIndexerModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorIndexerModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">numFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Number of features, i.e., length of Vectors which this transforms.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"numFeatures"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">categoryMaps</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Feature value index. Keys are categorical feature indices (column indices).</span> |
| <span class="sd"> Values are maps from original feature values to 0-based category indices.</span> |
| <span class="sd"> If a feature is not in this map, it is treated as continuous.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"javaCategoryMaps"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="VectorSlicer"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">VectorSlicer</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasOutputCol</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"VectorSlicer"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> This class takes a feature vector and outputs a new feature vector with a subarray</span> |
| <span class="sd"> of the original features.</span> |
| |
| <span class="sd"> The subset of features can be specified with either indices (`setIndices()`)</span> |
| <span class="sd"> or names (`setNames()`). At least one feature must be selected. Duplicate features</span> |
| <span class="sd"> are not allowed, so there can be no overlap between selected indices and names.</span> |
| |
| <span class="sd"> The output vector will order features with the selected indices first (in the order given),</span> |
| <span class="sd"> followed by the selected names (in the order given).</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])</span> |
| <span class="sd"> >>> vs = VectorSlicer(outputCol="sliced", indices=[1, 4])</span> |
| <span class="sd"> >>> vs.setInputCol("features")</span> |
| <span class="sd"> VectorSlicer...</span> |
| <span class="sd"> >>> vs.transform(df).head().sliced</span> |
| <span class="sd"> DenseVector([2.3, 1.0])</span> |
| <span class="sd"> >>> vectorSlicerPath = temp_path + "/vector-slicer"</span> |
| <span class="sd"> >>> vs.save(vectorSlicerPath)</span> |
| <span class="sd"> >>> loadedVs = VectorSlicer.load(vectorSlicerPath)</span> |
| <span class="sd"> >>> loadedVs.getIndices() == vs.getIndices()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedVs.getNames() == vs.getNames()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedVs.transform(df).take(1) == vs.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">indices</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"indices"</span><span class="p">,</span> |
| <span class="s2">"An array of indices to select features from "</span> |
| <span class="o">+</span> <span class="s2">"a vector column. There can be no overlap with names."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">names</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"names"</span><span class="p">,</span> |
| <span class="s2">"An array of feature names to select features from "</span> |
| <span class="o">+</span> <span class="s2">"a vector column. These names must be specified by ML "</span> |
| <span class="o">+</span> <span class="s2">"org.apache.spark.ml.attribute.Attribute. There can be no overlap with "</span> |
| <span class="o">+</span> <span class="s2">"indices."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toListString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">indices</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, outputCol=None, indices=None, names=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">VectorSlicer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.VectorSlicer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">indices</span><span class="o">=</span><span class="p">[],</span> <span class="n">names</span><span class="o">=</span><span class="p">[])</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="VectorSlicer.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">indices</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorSlicer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, outputCol=None, indices=None, names=None)</span> |
| <span class="sd"> Sets params for this VectorSlicer.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSlicer.setIndices"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setIndices">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setIndices</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"VectorSlicer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`indices`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">indices</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSlicer.getIndices"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.getIndices">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getIndices</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of indices or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">indices</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSlicer.setNames"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setNames">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setNames</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"VectorSlicer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`names`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">names</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSlicer.getNames"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.getNames">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.6.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getNames</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of names or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">names</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSlicer.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorSlicer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSlicer.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSlicer.html#pyspark.ml.feature.VectorSlicer.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorSlicer"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_Word2VecParams</span><span class="p">(</span><span class="n">HasStepSize</span><span class="p">,</span> <span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasSeed</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`Word2Vec` and :py:class:`Word2VecModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">vectorSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"vectorSize"</span><span class="p">,</span> |
| <span class="s2">"the dimension of codes after transforming from words"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"numPartitions"</span><span class="p">,</span> |
| <span class="s2">"number of partitions for sentences of words"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">minCount</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"minCount"</span><span class="p">,</span> |
| <span class="s2">"the minimum number of times a token must appear to be included in the "</span> |
| <span class="o">+</span> <span class="s2">"word2vec model's vocabulary"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">windowSize</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"windowSize"</span><span class="p">,</span> |
| <span class="s2">"the window size (context words from [-window, window]). Default value is 5"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">maxSentenceLength</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"maxSentenceLength"</span><span class="p">,</span> |
| <span class="s2">"Maximum length (in words) of each sentence in the input data. "</span> |
| <span class="o">+</span> <span class="s2">"Any sentence longer than this threshold will "</span> |
| <span class="o">+</span> <span class="s2">"be divided into chunks up to the size."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_Word2VecParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span> |
| <span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> |
| <span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> |
| <span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> |
| <span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> |
| <span class="n">windowSize</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> |
| <span class="n">maxSentenceLength</span><span class="o">=</span><span class="mi">1000</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of vectorSize or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vectorSize</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of numPartitions or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numPartitions</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of minCount or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minCount</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getWindowSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of windowSize or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">windowSize</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getMaxSentenceLength</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of maxSentenceLength or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxSentenceLength</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="Word2Vec"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">Word2Vec</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"Word2VecModel"</span><span class="p">],</span> |
| <span class="n">_Word2VecParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Word2Vec"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Word2Vec trains a model of `Map(String, Vector)`, i.e. it transforms a word into a code for further</span> |
| <span class="sd"> natural language processing or machine learning processes.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> sent = ("a b " * 100 + "a c " * 10).split(" ")</span> |
| <span class="sd"> >>> doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"])</span> |
| <span class="sd"> >>> word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model")</span> |
| <span class="sd"> >>> word2Vec.setMaxIter(10)</span> |
| <span class="sd"> Word2Vec...</span> |
| <span class="sd"> >>> word2Vec.getMaxIter()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> word2Vec.clear(word2Vec.maxIter)</span> |
| <span class="sd"> >>> model = word2Vec.fit(doc)</span> |
| <span class="sd"> >>> model.getMinCount()</span> |
| <span class="sd"> 5</span> |
| <span class="sd"> >>> model.setInputCol("sentence")</span> |
| <span class="sd"> Word2VecModel...</span> |
| <span class="sd"> >>> model.getVectors().show()</span> |
| <span class="sd"> +----+--------------------+</span> |
| <span class="sd"> |word| vector|</span> |
| <span class="sd"> +----+--------------------+</span> |
| <span class="sd"> | a|[0.0951...</span> |
| <span class="sd"> | b|[-1.202...</span> |
| <span class="sd"> | c|[0.3015...</span> |
| <span class="sd"> +----+--------------------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> model.findSynonymsArray("a", 2)</span> |
| <span class="sd"> [('b', 0.015859...), ('c', -0.568079...)]</span> |
| <span class="sd"> >>> from pyspark.sql.functions import format_number as fmt</span> |
| <span class="sd"> >>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show()</span> |
| <span class="sd"> +----+----------+</span> |
| <span class="sd"> |word|similarity|</span> |
| <span class="sd"> +----+----------+</span> |
| <span class="sd"> | b| 0.01586|</span> |
| <span class="sd"> | c| -0.56808|</span> |
| <span class="sd"> +----+----------+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> model.transform(doc).head().model</span> |
| <span class="sd"> DenseVector([-0.4833, 0.1855, -0.273, -0.0509, -0.4769])</span> |
| <span class="sd"> >>> word2vecPath = temp_path + "/word2vec"</span> |
| <span class="sd"> >>> word2Vec.save(word2vecPath)</span> |
| <span class="sd"> >>> loadedWord2Vec = Word2Vec.load(word2vecPath)</span> |
| <span class="sd"> >>> loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedWord2Vec.getMinCount() == word2Vec.getMinCount()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/word2vec-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = Word2VecModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.getVectors().first().word == model.getVectors().first().word</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.getVectors().first().vector == model.getVectors().first().vector</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(doc).take(1) == model.transform(doc).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">vectorSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> |
| <span class="n">minCount</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span> |
| <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">stepSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.025</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">windowSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span> |
| <span class="n">maxSentenceLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1000</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, \</span> |
| <span class="sd"> maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, \</span> |
| <span class="sd"> maxSentenceLength=1000)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">Word2Vec</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.Word2Vec"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Word2Vec.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">vectorSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> |
| <span class="n">minCount</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span> |
| <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">stepSize</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.025</span><span class="p">,</span> |
| <span class="n">maxIter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">windowSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span> |
| <span class="n">maxSentenceLength</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1000</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, \</span> |
| <span class="sd"> maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, \</span> |
| <span class="sd"> maxSentenceLength=1000)</span> |
| <span class="sd"> Sets params for this Word2Vec.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setVectorSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setVectorSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`vectorSize`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setNumPartitions"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setNumPartitions">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`numPartitions`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numPartitions</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setMinCount"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMinCount">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`minCount`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">minCount</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setWindowSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setWindowSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setWindowSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`windowSize`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">windowSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setMaxSentenceLength"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMaxSentenceLength">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setMaxSentenceLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxSentenceLength`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxSentenceLength</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setMaxIter"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setMaxIter">[docs]</a> <span class="k">def</span> <span class="nf">setMaxIter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`maxIter`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setSeed"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setSeed">[docs]</a> <span class="k">def</span> <span class="nf">setSeed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`seed`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2Vec.setStepSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec.setStepSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.4.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setStepSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2Vec"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`stepSize`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stepSize</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2VecModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">Word2VecModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="Word2VecModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel">[docs]</a><span class="k">class</span> <span class="nc">Word2VecModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_Word2VecParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"Word2VecModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`Word2Vec`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="Word2VecModel.getVectors"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.getVectors">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getVectors</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the vector representation of the words as a dataframe</span> |
| <span class="sd"> with two fields, word and vector.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"getVectors"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2VecModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2VecModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2VecModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Word2VecModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2VecModel.findSynonyms"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.findSynonyms">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">findSynonyms</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Vector</span><span class="p">],</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Find "num" number of words closest in similarity to "word".</span> |
| <span class="sd"> word can be a string or vector representation.</span> |
| <span class="sd"> Returns a dataframe with two fields, word and similarity (which</span> |
| <span class="sd"> gives the cosine similarity).</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">word</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">word</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"findSynonyms"</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Word2VecModel.findSynonymsArray"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.Word2VecModel.html#pyspark.ml.feature.Word2VecModel.findSynonymsArray">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">findSynonymsArray</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Vector</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">float</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Find "num" number of words closest in similarity to "word".</span> |
| <span class="sd"> word can be a string or vector representation.</span> |
| <span class="sd"> Returns a list of (word, similarity) tuples, where similarity</span> |
| <span class="sd"> is the cosine similarity.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">word</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">word</span><span class="p">)</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">tuples</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span><span class="o">.</span><span class="n">findSynonymsArray</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">)</span> |
| <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">st</span><span class="p">:</span> <span class="p">(</span><span class="n">st</span><span class="o">.</span><span class="n">_1</span><span class="p">(),</span> <span class="n">st</span><span class="o">.</span><span class="n">_2</span><span class="p">()),</span> <span class="nb">list</span><span class="p">(</span><span class="n">tuples</span><span class="p">)))</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_PCAParams</span><span class="p">(</span><span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`PCA` and :py:class:`PCAModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">k</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"k"</span><span class="p">,</span> |
| <span class="s2">"the number of principal components"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of k or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="PCA"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">PCA</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"PCAModel"</span><span class="p">],</span> <span class="n">_PCAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"PCA"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> PCA trains a model to project vectors to a lower dimensional space of the</span> |
| <span class="sd"> top :py:attr:`k` principal components.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),</span> |
| <span class="sd"> ... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data,["features"])</span> |
| <span class="sd"> >>> pca = PCA(k=2, inputCol="features")</span> |
| <span class="sd"> >>> pca.setOutputCol("pca_features")</span> |
| <span class="sd"> PCA...</span> |
| <span class="sd"> >>> model = pca.fit(df)</span> |
| <span class="sd"> >>> model.getK()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> model.setOutputCol("output")</span> |
| <span class="sd"> PCAModel...</span> |
| <span class="sd"> >>> model.transform(df).collect()[0].output</span> |
| <span class="sd"> DenseVector([1.648..., -4.013...])</span> |
| <span class="sd"> >>> model.explainedVariance</span> |
| <span class="sd"> DenseVector([0.794..., 0.205...])</span> |
| <span class="sd"> >>> pcaPath = temp_path + "/pca"</span> |
| <span class="sd"> >>> pca.save(pcaPath)</span> |
| <span class="sd"> >>> loadedPca = PCA.load(pcaPath)</span> |
| <span class="sd"> >>> loadedPca.getK() == pca.getK()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/pca-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = PCAModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.pc == model.pc</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.explainedVariance == model.explainedVariance</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, k=None, inputCol=None, outputCol=None)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">PCA</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.PCA"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="PCA.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">k</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"PCA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, k=None, inputCol=None, outputCol=None)</span> |
| <span class="sd"> Set params for this PCA.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PCA.setK"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setK">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PCA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`k`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PCA.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PCA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PCA.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PCA"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PCAModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">PCAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="PCAModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel">[docs]</a><span class="k">class</span> <span class="nc">PCAModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_PCAParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"PCAModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="PCAModel.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel.setInputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PCAModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="PCAModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.PCAModel.html#pyspark.ml.feature.PCAModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PCAModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">pc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DenseMatrix</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a principal components Matrix.</span> |
| <span class="sd"> Each column is one principal component.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"pc"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">explainedVariance</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DenseVector</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a vector of proportions of variance</span> |
| <span class="sd"> explained by each principal component.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"explainedVariance"</span><span class="p">)</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_RFormulaParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">,</span> <span class="n">HasHandleInvalid</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`RFormula` and :py:class:`RFormulaModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">formula</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">"formula"</span><span class="p">,</span> <span class="s2">"R model formula"</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span> |
| <span class="p">)</span> |
| |
| <span class="n">forceIndexLabel</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"forceIndexLabel"</span><span class="p">,</span> |
| <span class="s2">"Force to index label whether it is numeric or string"</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toBoolean</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">stringIndexerOrderType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"stringIndexerOrderType"</span><span class="p">,</span> |
| <span class="s2">"How to order categories of a string feature column used by "</span> |
| <span class="o">+</span> <span class="s2">"StringIndexer. The last category after ordering is dropped "</span> |
| <span class="o">+</span> <span class="s2">"when encoding strings. Supported options: frequencyDesc, "</span> |
| <span class="o">+</span> <span class="s2">"frequencyAsc, alphabetDesc, alphabetAsc. The default value "</span> |
| <span class="o">+</span> <span class="s2">"is frequencyDesc. When the ordering is set to alphabetDesc, "</span> |
| <span class="o">+</span> <span class="s2">"RFormula drops the same category as R when encoding strings."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"how to handle invalid entries. "</span> |
| <span class="o">+</span> <span class="s2">"Options are 'skip' (filter out rows with invalid values), "</span> |
| <span class="o">+</span> <span class="s2">"'error' (throw an error), or 'keep' (put invalid data in a special "</span> |
| <span class="o">+</span> <span class="s2">"additional bucket, at index numLabels)."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_RFormulaParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span> |
| <span class="n">forceIndexLabel</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="s2">"frequencyDesc"</span><span class="p">,</span> <span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`formula`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getForceIndexLabel</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`forceIndexLabel`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">forceIndexLabel</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getStringIndexerOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of :py:attr:`stringIndexerOrderType` or its default value 'frequencyDesc'.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stringIndexerOrderType</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="RFormula"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">RFormula</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"RFormulaModel"</span><span class="p">],</span> |
| <span class="n">_RFormulaParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"RFormula"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Implements the transforms required for fitting a dataset against an</span> |
| <span class="sd"> R model formula. Currently we support a limited subset of the R</span> |
| <span class="sd"> operators, including '~', '.', ':', '+', '-', '*', and '^'.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Also see the `R formula docs</span> |
| <span class="sd"> &lt;http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html&gt;`_.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (1.0, 1.0, "a"),</span> |
| <span class="sd"> ... (0.0, 2.0, "b"),</span> |
| <span class="sd"> ... (0.0, 0.0, "a")</span> |
| <span class="sd"> ... ], ["y", "x", "s"])</span> |
| <span class="sd"> >>> rf = RFormula(formula="y ~ x + s")</span> |
| <span class="sd"> >>> model = rf.fit(df)</span> |
| <span class="sd"> >>> model.getLabelCol()</span> |
| <span class="sd"> 'label'</span> |
| <span class="sd"> >>> model.transform(df).show()</span> |
| <span class="sd"> +---+---+---+---------+-----+</span> |
| <span class="sd"> | y| x| s| features|label|</span> |
| <span class="sd"> +---+---+---+---------+-----+</span> |
| <span class="sd"> |1.0|1.0| a|[1.0,1.0]| 1.0|</span> |
| <span class="sd"> |0.0|2.0| b|[2.0,0.0]| 0.0|</span> |
| <span class="sd"> |0.0|0.0| a|[0.0,1.0]| 0.0|</span> |
| <span class="sd"> +---+---+---+---------+-----+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> rf.fit(df, {rf.formula: "y ~ . - s"}).transform(df).show()</span> |
| <span class="sd"> +---+---+---+--------+-----+</span> |
| <span class="sd"> | y| x| s|features|label|</span> |
| <span class="sd"> +---+---+---+--------+-----+</span> |
| <span class="sd"> |1.0|1.0| a| [1.0]| 1.0|</span> |
| <span class="sd"> |0.0|2.0| b| [2.0]| 0.0|</span> |
| <span class="sd"> |0.0|0.0| a| [0.0]| 0.0|</span> |
| <span class="sd"> +---+---+---+--------+-----+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> rFormulaPath = temp_path + "/rFormula"</span> |
| <span class="sd"> >>> rf.save(rFormulaPath)</span> |
| <span class="sd"> >>> loadedRF = RFormula.load(rFormulaPath)</span> |
| <span class="sd"> >>> loadedRF.getFormula() == rf.getFormula()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedRF.getFeaturesCol() == rf.getFeaturesCol()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedRF.getLabelCol() == rf.getLabelCol()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedRF.getHandleInvalid() == rf.getHandleInvalid()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> str(loadedRF)</span> |
| <span class="sd"> 'RFormula(y ~ x + s) (uid=...)'</span> |
| <span class="sd"> >>> modelPath = temp_path + "/rFormulaModel"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = RFormulaModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.uid == model.uid</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).show()</span> |
| <span class="sd"> +---+---+---+---------+-----+</span> |
| <span class="sd"> | y| x| s| features|label|</span> |
| <span class="sd"> +---+---+---+---------+-----+</span> |
| <span class="sd"> |1.0|1.0| a|[1.0,1.0]| 1.0|</span> |
| <span class="sd"> |0.0|2.0| b|[2.0,0.0]| 0.0|</span> |
| <span class="sd"> |0.0|0.0| a|[0.0,1.0]| 0.0|</span> |
| <span class="sd"> +---+---+---+---------+-----+</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> str(loadedModel)</span> |
| <span class="sd"> 'RFormulaModel(ResolvedRFormula(label=y, terms=[x,s], hasIntercept=true)) (uid=...)'</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">formula</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"label"</span><span class="p">,</span> |
| <span class="n">forceIndexLabel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">stringIndexerOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"frequencyDesc"</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, formula=None, featuresCol="features", labelCol="label", \</span> |
| <span class="sd"> forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", \</span> |
| <span class="sd"> handleInvalid="error")</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">RFormula</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.RFormula"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="RFormula.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">formula</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"label"</span><span class="p">,</span> |
| <span class="n">forceIndexLabel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">stringIndexerOrderType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"frequencyDesc"</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormula"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, formula=None, featuresCol="features", labelCol="label", \</span> |
| <span class="sd"> forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", \</span> |
| <span class="sd"> handleInvalid="error")</span> |
| <span class="sd"> Sets params for RFormula.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RFormula.setFormula"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setFormula">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"1.5.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormula"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`formula`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">formula</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RFormula.setForceIndexLabel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setForceIndexLabel">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setForceIndexLabel</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormula"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`forceIndexLabel`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">forceIndexLabel</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RFormula.setStringIndexerOrderType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setStringIndexerOrderType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setStringIndexerOrderType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormula"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`stringIndexerOrderType`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">stringIndexerOrderType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RFormula.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setFeaturesCol">[docs]</a> <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormula"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RFormula.setLabelCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setLabelCol">[docs]</a> <span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormula"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`labelCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RFormula.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormula.html#pyspark.ml.feature.RFormula.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormula"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RFormulaModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">RFormulaModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="n">formulaStr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getFormula</span><span class="p">()</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">isDefined</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">)</span> <span class="k">else</span> <span class="s2">""</span> |
| <span class="k">return</span> <span class="s2">"RFormula(</span><span class="si">%s</span><span class="s2">) (uid=</span><span class="si">%s</span><span class="s2">)"</span> <span class="o">%</span> <span class="p">(</span><span class="n">formulaStr</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="RFormulaModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.RFormulaModel.html#pyspark.ml.feature.RFormulaModel">[docs]</a><span class="k">class</span> <span class="nc">RFormulaModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_RFormulaParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"RFormulaModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`RFormula`. Fitting is required to determine the</span> |
| <span class="sd"> factor levels of formula terms.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="n">resolvedFormula</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"resolvedFormula"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="s2">"RFormulaModel(</span><span class="si">%s</span><span class="s2">) (uid=</span><span class="si">%s</span><span class="s2">)"</span> <span class="o">%</span> <span class="p">(</span><span class="n">resolvedFormula</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_SelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`Selector` and :py:class:`SelectorModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">selectorType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"selectorType"</span><span class="p">,</span> |
| <span class="s2">"The selector type. "</span> |
| <span class="o">+</span> <span class="s2">"Supported options: numTopFeatures (default), percentile, fpr, fdr, fwe."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">numTopFeatures</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"numTopFeatures"</span><span class="p">,</span> |
| <span class="s2">"Number of features that selector will select, ordered by ascending p-value. "</span> |
| <span class="o">+</span> <span class="s2">"If the number of features is &lt; numTopFeatures, then this will select "</span> |
| <span class="o">+</span> <span class="s2">"all features."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">percentile</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"percentile"</span><span class="p">,</span> |
| <span class="s2">"Percentile of features that selector "</span> <span class="o">+</span> <span class="s2">"will select, ordered by ascending p-value."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">fpr</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"fpr"</span><span class="p">,</span> |
| <span class="s2">"The highest p-value for features to be kept."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">fdr</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"fdr"</span><span class="p">,</span> |
| <span class="s2">"The upper bound of the expected false discovery rate."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">fwe</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"fwe"</span><span class="p">,</span> |
| <span class="s2">"The upper bound of the expected family-wise error rate."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_SelectorParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span> |
| <span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">50</span><span class="p">,</span> |
| <span class="n">selectorType</span><span class="o">=</span><span class="s2">"numTopFeatures"</span><span class="p">,</span> |
| <span class="n">percentile</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span> |
| <span class="n">fpr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">fdr</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">fwe</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSelectorType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of selectorType or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectorType</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getNumTopFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of numTopFeatures or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numTopFeatures</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getPercentile</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of percentile or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">percentile</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getFpr</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of fpr or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fpr</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getFdr</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of fdr or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fdr</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getFwe</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of fwe or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fwe</span><span class="p">)</span> |
| |
| |
| <span class="k">class</span> <span class="nc">_Selector</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">[</span><span class="n">JM</span><span class="p">],</span> <span class="n">_SelectorParams</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">,</span> <span class="n">JavaMLWritable</span><span class="p">,</span> <span class="n">Generic</span><span class="p">[</span><span class="n">JM</span><span class="p">]):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Mixin for Selectors.</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSelectorType</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`selectorType`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectorType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setNumTopFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`numTopFeatures`.</span> |
| <span class="sd"> Only applicable when selectorType = "numTopFeatures".</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">numTopFeatures</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setPercentile</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`percentile`.</span> |
| <span class="sd"> Only applicable when selectorType = "percentile".</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">percentile</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFpr</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`fpr`.</span> |
| <span class="sd"> Only applicable when selectorType = "fpr".</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fpr</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFdr</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`fdr`.</span> |
| <span class="sd"> Only applicable when selectorType = "fdr".</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fdr</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.2.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFwe</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`fwe`.</span> |
| <span class="sd"> Only applicable when selectorType = "fwe".</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">fwe</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`labelCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| |
| <span class="k">class</span> <span class="nc">_SelectorModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">,</span> <span class="n">_SelectorParams</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Mixin for Selector models.</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">P</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">P</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> List of indices to select (filter).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"selectedFeatures"</span><span class="p">)</span> |
| |
| |
| <!-- Highlighted source listing for the ChiSqSelector class. The div id is the anchor for this listing and the [docs] link points back to the class entry on the API reference page. --><div class="viewcode-block" id="ChiSqSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelector.html#pyspark.ml.feature.ChiSqSelector">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">ChiSqSelector</span><span class="p">(</span> |
| <span class="n">_Selector</span><span class="p">[</span><span class="s2">"ChiSqSelectorModel"</span><span class="p">],</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"ChiSqSelector"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Chi-Squared feature selection, which selects categorical features to use for predicting a</span> |
| <span class="sd"> categorical label.</span> |
| <span class="sd"> The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`,</span> |
| <span class="sd"> `fdr`, `fwe`.</span> |
| |
| <span class="sd"> * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.</span> |
| |
| <span class="sd"> * `percentile` is similar but chooses a fraction of all features</span> |
| <span class="sd"> instead of a fixed number.</span> |
| |
| <span class="sd"> * `fpr` chooses all features whose p-values are below a threshold,</span> |
| <span class="sd"> thus controlling the false positive rate of selection.</span> |
| |
| <span class="sd"> * `fdr` uses the `Benjamini-Hochberg procedure <https://en.wikipedia.org/wiki/</span> |
| <span class="sd"> False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure>`_</span> |
| <span class="sd"> to choose all features whose false discovery rate is below a threshold.</span> |
| |
| <span class="sd"> * `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by</span> |
| <span class="sd"> 1/numFeatures, thus controlling the family-wise error rate of selection.</span> |
| |
| <span class="sd"> By default, the selection method is `numTopFeatures`, with the default number of top features</span> |
| <span class="sd"> set to 50.</span> |
| |
| <span class="sd"> .. deprecated:: 3.1.0</span> |
| <span class="sd"> Use UnivariateFeatureSelector</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),</span> |
| <span class="sd"> ... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),</span> |
| <span class="sd"> ... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],</span> |
| <span class="sd"> ... ["features", "label"])</span> |
| <span class="sd"> >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")</span> |
| <span class="sd"> >>> model = selector.fit(df)</span> |
| <span class="sd"> >>> model.getFeaturesCol()</span> |
| <span class="sd"> 'features'</span> |
| <span class="sd"> >>> model.setFeaturesCol("features")</span> |
| <span class="sd"> ChiSqSelectorModel...</span> |
| <span class="sd"> >>> model.transform(df).head().selectedFeatures</span> |
| <span class="sd"> DenseVector([18.0])</span> |
| <span class="sd"> >>> model.selectedFeatures</span> |
| <span class="sd"> [2]</span> |
| <span class="sd"> >>> chiSqSelectorPath = temp_path + "/chi-sq-selector"</span> |
| <span class="sd"> >>> selector.save(chiSqSelectorPath)</span> |
| <span class="sd"> >>> loadedSelector = ChiSqSelector.load(chiSqSelectorPath)</span> |
| <span class="sd"> >>> loadedSelector.getNumTopFeatures() == selector.getNumTopFeatures()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/chi-sq-selector-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = ChiSqSelectorModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.selectedFeatures == model.selectedFeatures</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numTopFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"label"</span><span class="p">,</span> |
| <span class="n">selectorType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"numTopFeatures"</span><span class="p">,</span> |
| <span class="n">percentile</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.1</span><span class="p">,</span> |
| <span class="n">fpr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">fdr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">fwe</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, numTopFeatures=50, featuresCol="features", outputCol=None, \</span> |
| <span class="sd"> labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, \</span> |
| <span class="sd"> fdr=0.05, fwe=0.05)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">ChiSqSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.ChiSqSelector"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <!-- Nested listing block for ChiSqSelector.setParams, with its own anchor id and [docs] back-link to the method entry on the API reference page. --><div class="viewcode-block" id="ChiSqSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelector.html#pyspark.ml.feature.ChiSqSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.0.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">numTopFeatures</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"label"</span><span class="p">,</span> |
| <span class="n">selectorType</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"numTopFeatures"</span><span class="p">,</span> |
| <span class="n">percentile</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.1</span><span class="p">,</span> |
| <span class="n">fpr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">fdr</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="n">fwe</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"ChiSqSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, numTopFeatures=50, featuresCol="features", outputCol=None, \</span> |
| <span class="sd"> labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, \</span> |
| <span class="sd"> fdr=0.05, fwe=0.05)</span> |
| <span class="sd"> Sets params for this ChiSqSelector.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ChiSqSelectorModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">ChiSqSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <!-- Highlighted source listing for the ChiSqSelectorModel class; the listed class body is a docstring only. The [docs] link points back to the class entry on the API reference page. --><div class="viewcode-block" id="ChiSqSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.ChiSqSelectorModel.html#pyspark.ml.feature.ChiSqSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">ChiSqSelectorModel</span><span class="p">(</span><span class="n">_SelectorModel</span><span class="p">,</span> <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"ChiSqSelectorModel"</span><span class="p">],</span> <span class="n">JavaMLWritable</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`ChiSqSelector`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| <span class="sd"> """</span></div> |
| |
| |
| <div class="viewcode-block" id="VectorSizeHint"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">VectorSizeHint</span><span class="p">(</span> |
| <span class="n">JavaTransformer</span><span class="p">,</span> |
| <span class="n">HasInputCol</span><span class="p">,</span> |
| <span class="n">HasHandleInvalid</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"VectorSizeHint"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A feature transformer that adds size information to the metadata of a vector column.</span> |
| <span class="sd"> VectorAssembler needs size information for its input columns and cannot be used on streaming</span> |
| <span class="sd"> dataframes without this metadata.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> VectorSizeHint modifies `inputCol` to include size metadata and does not have an outputCol.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> from pyspark.ml import Pipeline, PipelineModel</span> |
| <span class="sd"> >>> data = [(Vectors.dense([1., 2., 3.]), 4.)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["vector", "float"])</span> |
| <span class="sd"> >>></span> |
| <span class="sd"> >>> sizeHint = VectorSizeHint(inputCol="vector", size=3, handleInvalid="skip")</span> |
| <span class="sd"> >>> vecAssembler = VectorAssembler(inputCols=["vector", "float"], outputCol="assembled")</span> |
| <span class="sd"> >>> pipeline = Pipeline(stages=[sizeHint, vecAssembler])</span> |
| <span class="sd"> >>></span> |
| <span class="sd"> >>> pipelineModel = pipeline.fit(df)</span> |
| <span class="sd"> >>> pipelineModel.transform(df).head().assembled</span> |
| <span class="sd"> DenseVector([1.0, 2.0, 3.0, 4.0])</span> |
| <span class="sd"> >>> vectorSizeHintPath = temp_path + "/vector-size-hint-pipeline"</span> |
| <span class="sd"> >>> pipelineModel.save(vectorSizeHintPath)</span> |
| <span class="sd"> >>> loadedPipeline = PipelineModel.load(vectorSizeHintPath)</span> |
| <span class="sd"> >>> loaded = loadedPipeline.transform(df).head().assembled</span> |
| <span class="sd"> >>> expected = pipelineModel.transform(df).head().assembled</span> |
| <span class="sd"> >>> loaded == expected</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="n">size</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s2">"size"</span><span class="p">,</span> <span class="s2">"Size of vectors in column."</span><span class="p">,</span> <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toInt</span> |
| <span class="p">)</span> |
| |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"handleInvalid"</span><span class="p">,</span> |
| <span class="s2">"How to handle invalid vectors in inputCol. Invalid vectors include "</span> |
| <span class="s2">"nulls and vectors with the wrong size. The options are `skip` (filter "</span> |
| <span class="s2">"out rows with invalid vectors), `error` (throw an error) and "</span> |
| <span class="s2">"`optimistic` (do not check the vector size, and keep all rows). "</span> |
| <span class="s2">"`error` by default."</span><span class="p">,</span> |
| <span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, inputCol=None, size=None, handleInvalid="error")</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">VectorSizeHint</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s2">"org.apache.spark.ml.feature.VectorSizeHint"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="s2">"error"</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span><span class="p">)</span> |
| |
| <!-- Highlighted source listing for VectorSizeHint.setParams, with a [docs] back-link to the method entry on the API reference page. NOTE(review): the size parameter is shown as Optional[int] so that it agrees with the Param[int] declaration, the __init__ signature and getSize in this same listing; the same annotation change must be made in the Python module this page is generated from. --><div class="viewcode-block" id="VectorSizeHint.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">inputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">handleInvalid</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"error"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorSizeHint"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, inputCol=None, size=None, handleInvalid="error")</span> |
| <span class="sd"> Sets params for this VectorSizeHint.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSizeHint.getSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.getSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSize</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Gets size param, the size of vectors in `inputCol`."""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">size</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSizeHint.setSize"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setSize">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"2.3.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorSizeHint"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Sets size param, the size of vectors in `inputCol`."""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSizeHint.setInputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setInputCol">[docs]</a> <span class="k">def</span> <span class="nf">setInputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorSizeHint"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`inputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VectorSizeHint.setHandleInvalid"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VectorSizeHint.html#pyspark.ml.feature.VectorSizeHint.setHandleInvalid">[docs]</a> <span class="k">def</span> <span class="nf">setHandleInvalid</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VectorSizeHint"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`handleInvalid`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">handleInvalid</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_VarianceThresholdSelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`VarianceThresholdSelector` and</span> |
| <span class="sd"> :py:class:`VarianceThresholdSelectorModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">varianceThreshold</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"varianceThreshold"</span><span class="p">,</span> |
| <span class="s2">"Param for variance threshold. Features with a variance not "</span> |
| <span class="o">+</span> <span class="s2">"greater than this threshold will be removed. The default value "</span> |
| <span class="o">+</span> <span class="s2">"is 0.0."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getVarianceThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of varianceThreshold or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">varianceThreshold</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="VarianceThresholdSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">VarianceThresholdSelector</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"VarianceThresholdSelectorModel"</span><span class="p">],</span> |
| <span class="n">_VarianceThresholdSelectorParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"VarianceThresholdSelector"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Feature selector that removes all low-variance features. Features with a</span> |
| <span class="sd"> (sample) variance not greater than the threshold will be removed. The default is to keep</span> |
| <span class="sd"> all features with non-zero variance, i.e. remove the features that have the</span> |
| <span class="sd"> same value in all samples.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(Vectors.dense([6.0, 7.0, 0.0, 7.0, 6.0, 0.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.0, 9.0, 6.0, 0.0, 5.0, 9.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.0, 9.0, 3.0, 0.0, 5.0, 5.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([0.0, 9.0, 8.0, 5.0, 6.0, 4.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([8.0, 9.0, 6.0, 5.0, 4.0, 4.0]),),</span> |
| <span class="sd"> ... (Vectors.dense([8.0, 9.0, 6.0, 0.0, 0.0, 0.0]),)],</span> |
| <span class="sd"> ... ["features"])</span> |
| <span class="sd"> >>> selector = VarianceThresholdSelector(varianceThreshold=8.2, outputCol="selectedFeatures")</span> |
| <span class="sd"> >>> model = selector.fit(df)</span> |
| <span class="sd"> >>> model.getFeaturesCol()</span> |
| <span class="sd"> 'features'</span> |
| <span class="sd"> >>> model.setFeaturesCol("features")</span> |
| <span class="sd"> VarianceThresholdSelectorModel...</span> |
| <span class="sd"> >>> model.transform(df).head().selectedFeatures</span> |
| <span class="sd"> DenseVector([6.0, 7.0, 0.0])</span> |
| <span class="sd"> >>> model.selectedFeatures</span> |
| <span class="sd"> [0, 3, 5]</span> |
| <span class="sd"> >>> varianceThresholdSelectorPath = temp_path + "/variance-threshold-selector"</span> |
| <span class="sd"> >>> selector.save(varianceThresholdSelectorPath)</span> |
| <span class="sd"> >>> loadedSelector = VarianceThresholdSelector.load(varianceThresholdSelectorPath)</span> |
| <span class="sd"> >>> loadedSelector.getVarianceThreshold() == selector.getVarianceThreshold()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/variance-threshold-selector-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = VarianceThresholdSelectorModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.selectedFeatures == model.selectedFeatures</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">varianceThreshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, featuresCol="features", outputCol=None, varianceThreshold=0.0)</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">VarianceThresholdSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.VarianceThresholdSelector"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">varianceThreshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="VarianceThresholdSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">varianceThreshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"VarianceThresholdSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, featuresCol="features", outputCol=None, varianceThreshold=0.0)</span> |
| <span class="sd"> Sets params for this VarianceThresholdSelector.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VarianceThresholdSelector.setVarianceThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setVarianceThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setVarianceThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VarianceThresholdSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`varianceThreshold`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">varianceThreshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VarianceThresholdSelector.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VarianceThresholdSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VarianceThresholdSelector.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelector.html#pyspark.ml.feature.VarianceThresholdSelector.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VarianceThresholdSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VarianceThresholdSelectorModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">VarianceThresholdSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="VarianceThresholdSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">VarianceThresholdSelectorModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> |
| <span class="n">_VarianceThresholdSelectorParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"VarianceThresholdSelectorModel"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`VarianceThresholdSelector`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="VarianceThresholdSelectorModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VarianceThresholdSelectorModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="VarianceThresholdSelectorModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.VarianceThresholdSelectorModel.html#pyspark.ml.feature.VarianceThresholdSelectorModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"VarianceThresholdSelectorModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.0"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> List of indices to select (filter).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"selectedFeatures"</span><span class="p">)</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">_UnivariateFeatureSelectorParams</span><span class="p">(</span><span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Params for :py:class:`UnivariateFeatureSelector` and</span> |
| <span class="sd"> :py:class:`UnivariateFeatureSelectorModel`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">featureType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"featureType"</span><span class="p">,</span> |
| <span class="s2">"The feature type. "</span> <span class="o">+</span> <span class="s2">"Supported options: categorical, continuous."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">labelType</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"labelType"</span><span class="p">,</span> |
| <span class="s2">"The label type. "</span> <span class="o">+</span> <span class="s2">"Supported options: categorical, continuous."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">selectionMode</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"selectionMode"</span><span class="p">,</span> |
| <span class="s2">"The selection mode. "</span> |
| <span class="o">+</span> <span class="s2">"Supported options: numTopFeatures (default), percentile, fpr, "</span> |
| <span class="o">+</span> <span class="s2">"fdr, fwe."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toString</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">selectionThreshold</span><span class="p">:</span> <span class="n">Param</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span> |
| <span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> |
| <span class="s2">"selectionThreshold"</span><span class="p">,</span> |
| <span class="s2">"The upper bound of the "</span> <span class="o">+</span> <span class="s2">"features that selector will select."</span><span class="p">,</span> |
| <span class="n">typeConverter</span><span class="o">=</span><span class="n">TypeConverters</span><span class="o">.</span><span class="n">toFloat</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">selectionMode</span><span class="o">=</span><span class="s2">"numTopFeatures"</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getFeatureType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of featureType or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">featureType</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getLabelType</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of labelType or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">labelType</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSelectionMode</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of selectionMode or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectionMode</span><span class="p">)</span> |
| |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">getSelectionThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gets the value of selectionThreshold or its default value.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">selectionThreshold</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector">[docs]</a><span class="nd">@inherit_doc</span> |
| <span class="k">class</span> <span class="nc">UnivariateFeatureSelector</span><span class="p">(</span> |
| <span class="n">JavaEstimator</span><span class="p">[</span><span class="s2">"UnivariateFeatureSelectorModel"</span><span class="p">],</span> |
| <span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"UnivariateFeatureSelector"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> UnivariateFeatureSelector</span> |
| <span class="sd"> Feature selector based on univariate statistical tests against labels. Currently, Spark</span> |
| <span class="sd"> supports three Univariate Feature Selectors: chi-squared, ANOVA F-test and F-value.</span> |
| <span class="sd"> Users can choose a Univariate Feature Selector by setting `featureType` and `labelType`,</span> |
| <span class="sd"> and Spark will pick the score function based on the specified `featureType` and `labelType`.</span> |
| |
| <span class="sd"> The following combinations of `featureType` and `labelType` are supported:</span> |
| |
| <span class="sd"> - `featureType` `categorical` and `labelType` `categorical`, Spark uses chi-squared,</span> |
| <span class="sd"> i.e. chi2 in sklearn.</span> |
| <span class="sd"> - `featureType` `continuous` and `labelType` `categorical`, Spark uses ANOVA F-test,</span> |
| <span class="sd"> i.e. f_classif in sklearn.</span> |
| <span class="sd"> - `featureType` `continuous` and `labelType` `continuous`, Spark uses F-value,</span> |
| <span class="sd"> i.e. f_regression in sklearn.</span> |
| |
| <span class="sd"> The `UnivariateFeatureSelector` supports different selection modes: `numTopFeatures`,</span> |
| <span class="sd"> `percentile`, `fpr`, `fdr`, `fwe`.</span> |
| |
| <span class="sd"> - `numTopFeatures` chooses a fixed number of top features according to a</span> |
| <span class="sd"> hypothesis.</span> |
| <span class="sd"> - `percentile` is similar but chooses a fraction of all features</span> |
| <span class="sd"> instead of a fixed number.</span> |
| <span class="sd"> - `fpr` chooses all features whose p-values are below a threshold,</span> |
| <span class="sd"> thus controlling the false positive rate of selection.</span> |
| <span class="sd"> - `fdr` uses the `Benjamini-Hochberg procedure \</span> |
| <span class="sd"> <https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure>`_</span> |
| <span class="sd"> to choose all features whose false discovery rate is below a threshold.</span> |
| <span class="sd"> - `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by</span> |
| <span class="sd"> 1 / `numFeatures`, thus controlling the family-wise error rate of selection.</span> |
| |
| <span class="sd"> By default, the selection mode is `numTopFeatures`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.1</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.ml.linalg import Vectors</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(Vectors.dense([1.7, 4.4, 7.6, 5.8, 9.6, 2.3]), 3.0),</span> |
| <span class="sd"> ... (Vectors.dense([8.8, 7.3, 5.7, 7.3, 2.2, 4.1]), 2.0),</span> |
| <span class="sd"> ... (Vectors.dense([1.2, 9.5, 2.5, 3.1, 8.7, 2.5]), 1.0),</span> |
| <span class="sd"> ... (Vectors.dense([3.7, 9.2, 6.1, 4.1, 7.5, 3.8]), 2.0),</span> |
| <span class="sd"> ... (Vectors.dense([8.9, 5.2, 7.8, 8.3, 5.2, 3.0]), 4.0),</span> |
| <span class="sd"> ... (Vectors.dense([7.9, 8.5, 9.2, 4.0, 9.4, 2.1]), 4.0)],</span> |
| <span class="sd"> ... ["features", "label"])</span> |
| <span class="sd"> >>> selector = UnivariateFeatureSelector(outputCol="selectedFeatures")</span> |
| <span class="sd"> >>> selector.setFeatureType("continuous").setLabelType("categorical").setSelectionThreshold(1)</span> |
| <span class="sd"> UnivariateFeatureSelector...</span> |
| <span class="sd"> >>> model = selector.fit(df)</span> |
| <span class="sd"> >>> model.getFeaturesCol()</span> |
| <span class="sd"> 'features'</span> |
| <span class="sd"> >>> model.setFeaturesCol("features")</span> |
| <span class="sd"> UnivariateFeatureSelectorModel...</span> |
| <span class="sd"> >>> model.transform(df).head().selectedFeatures</span> |
| <span class="sd"> DenseVector([7.6])</span> |
| <span class="sd"> >>> model.selectedFeatures</span> |
| <span class="sd"> [2]</span> |
| <span class="sd"> >>> selectorPath = temp_path + "/selector"</span> |
| <span class="sd"> >>> selector.save(selectorPath)</span> |
| <span class="sd"> >>> loadedSelector = UnivariateFeatureSelector.load(selectorPath)</span> |
| <span class="sd"> >>> loadedSelector.getSelectionThreshold() == selector.getSelectionThreshold()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> modelPath = temp_path + "/selector-model"</span> |
| <span class="sd"> >>> model.save(modelPath)</span> |
| <span class="sd"> >>> loadedModel = UnivariateFeatureSelectorModel.load(modelPath)</span> |
| <span class="sd"> >>> loadedModel.selectedFeatures == model.selectedFeatures</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_input_kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> |
| |
| <span class="nd">@keyword_only</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"label"</span><span class="p">,</span> |
| <span class="n">selectionMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"numTopFeatures"</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> __init__(self, \\*, featuresCol="features", outputCol=None, \</span> |
| <span class="sd"> labelCol="label", selectionMode="numTopFeatures")</span> |
| <span class="sd"> """</span> |
| <span class="nb">super</span><span class="p">(</span><span class="n">UnivariateFeatureSelector</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span> |
| <span class="s2">"org.apache.spark.ml.feature.UnivariateFeatureSelector"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span> |
| <span class="p">)</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setParams"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setParams">[docs]</a> <span class="nd">@keyword_only</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"features"</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">labelCol</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"label"</span><span class="p">,</span> |
| <span class="n">selectionMode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"numTopFeatures"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> setParams(self, \\*, featuresCol="features", outputCol=None, \</span> |
| <span class="sd"> labelCol="label", selectionMode="numTopFeatures")</span> |
| <span class="sd"> Sets params for this UnivariateFeatureSelector.</span> |
| <span class="sd"> """</span> |
| <span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_input_kwargs</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setFeatureType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setFeatureType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeatureType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featureType`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featureType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setLabelType"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setLabelType">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setLabelType</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`labelType`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelType</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setSelectionMode"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setSelectionMode">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSelectionMode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`selectionMode`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectionMode</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setSelectionThreshold"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setSelectionThreshold">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setSelectionThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`selectionThreshold`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">selectionThreshold</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setFeaturesCol">[docs]</a> <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setOutputCol">[docs]</a> <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelector.setLabelCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html#pyspark.ml.feature.UnivariateFeatureSelector.setLabelCol">[docs]</a> <span class="k">def</span> <span class="nf">setLabelCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelector"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`labelCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">labelCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelectorModel"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">UnivariateFeatureSelectorModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelectorModel"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel">[docs]</a><span class="k">class</span> <span class="nc">UnivariateFeatureSelectorModel</span><span class="p">(</span> |
| <span class="n">JavaModel</span><span class="p">,</span> |
| <span class="n">_UnivariateFeatureSelectorParams</span><span class="p">,</span> |
| <span class="n">JavaMLReadable</span><span class="p">[</span><span class="s2">"UnivariateFeatureSelectorModel"</span><span class="p">],</span> |
| <span class="n">JavaMLWritable</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Model fitted by :py:class:`UnivariateFeatureSelector`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.1</span> |
| <span class="sd"> """</span> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelectorModel.setFeaturesCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel.setFeaturesCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setFeaturesCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelectorModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`featuresCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">featuresCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="UnivariateFeatureSelectorModel.setOutputCol"><a class="viewcode-back" href="../../../reference/api/pyspark.ml.feature.UnivariateFeatureSelectorModel.html#pyspark.ml.feature.UnivariateFeatureSelectorModel.setOutputCol">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">setOutputCol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UnivariateFeatureSelectorModel"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Sets the value of :py:attr:`outputCol`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="n">outputCol</span><span class="o">=</span><span class="n">value</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s2">"3.1.1"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">selectedFeatures</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> List of indices to select (filter).</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s2">"selectedFeatures"</span><span class="p">)</span></div> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">tempfile</span> |
| |
| <span class="kn">import</span> <span class="nn">pyspark.ml.feature</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span><span class="p">,</span> <span class="n">SparkSession</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="nb">globals</span><span class="p">()</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">features</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">ml</span><span class="o">.</span><span class="n">feature</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">features</span><span class="p">)</span> |
| |
| <span class="c1"># The small batch size here ensures that we see multiple batches,</span> |
| <span class="c1"># even in these small test examples:</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[2]"</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"ml.feature tests"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"spark"</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span> |
| <span class="n">testData</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"a"</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"b"</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"c"</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"a"</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"a"</span><span class="p">),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"c"</span><span class="p">),</span> |
| <span class="p">],</span> |
| <span class="mi">2</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"stringIndDf"</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">testData</span><span class="p">)</span> |
| <span class="n">temp_path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"temp_path"</span><span class="p">]</span> <span class="o">=</span> <span class="n">temp_path</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">finally</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">shutil</span> <span class="kn">import</span> <span class="n">rmtree</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">rmtree</span><span class="p">(</span><span class="n">temp_path</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span> |
| <span class="k">pass</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |