Source code for pyspark.rdd

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
import sys
import os
import operator
import shlex
import warnings
import heapq
import bisect
import random
from subprocess import Popen, PIPE
from threading import Thread
from collections import defaultdict
from itertools import chain
from functools import reduce
from math import sqrt, log, isinf, isnan, pow, ceil
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    Hashable,
    Iterable,
    Iterator,
    IO,
    List,
    NoReturn,
    Optional,
    Sequence,
    Tuple,
    Union,
    TypeVar,
    cast,
    overload,
    TYPE_CHECKING,
)

from pyspark.java_gateway import local_connect_and_auth
from pyspark.serializers import (
    AutoBatchedSerializer,
    BatchedSerializer,
    NoOpSerializer,
    CartesianDeserializer,
    CloudPickleSerializer,
    PairDeserializer,
    CPickleSerializer,
    Serializer,
    pack_long,
    read_int,
    write_int,
)
from pyspark.join import (
    python_join,
    python_left_outer_join,
    python_right_outer_join,
    python_full_outer_join,
    python_cogroup,
)
from pyspark.statcounter import StatCounter
from pyspark.rddsampler import RDDSampler, RDDRangeSampler, RDDStratifiedSampler
from pyspark.storagelevel import StorageLevel
from pyspark.resource.requests import ExecutorResourceRequests, TaskResourceRequests
from pyspark.resource.profile import ResourceProfile
from pyspark.resultiterable import ResultIterable
from pyspark.shuffle import (
    Aggregator,
    ExternalMerger,
    get_used_memory,
    ExternalSorter,
    ExternalGroupBy,
)
from pyspark.traceback_utils import SCCallSiteSync
from pyspark.util import fail_on_stopiteration, _parse_memory
from pyspark.errors import PySparkRuntimeError


if TYPE_CHECKING:
    import socket
    import io

    from pyspark._typing import NonUDFType
    from pyspark._typing import S, NumberOrArray
    from pyspark.context import SparkContext
    from pyspark.sql.pandas._typing import (
        PandasScalarUDFType,
        PandasGroupedMapUDFType,
        PandasGroupedAggUDFType,
        PandasWindowAggUDFType,
        PandasScalarIterUDFType,
        PandasMapIterUDFType,
        PandasCogroupedMapUDFType,
        ArrowMapIterUDFType,
        PandasGroupedMapUDFWithStateType,
    )
    from pyspark.sql.dataframe import DataFrame
    from pyspark.sql.types import AtomicType, StructType
    from pyspark.sql._typing import (
        AtomicValue,
        RowLike,
        SQLArrowBatchedUDFType,
        SQLArrowTableUDFType,
        SQLBatchedUDFType,
        SQLTableUDFType,
    )

    from py4j.java_gateway import JavaObject
    from py4j.java_collections import JavaArray

T = TypeVar("T")
T_co = TypeVar("T_co", covariant=True)
U = TypeVar("U")
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")
V1 = TypeVar("V1")
V2 = TypeVar("V2")
V3 = TypeVar("V3")


__all__ = ["RDD"]


class PythonEvalType:
    """
    Evaluation type of python rdd.

    These values are internal to PySpark.

    These values should match values in org.apache.spark.api.python.PythonEvalType.
    """

    NON_UDF: "NonUDFType" = 0

    SQL_BATCHED_UDF: "SQLBatchedUDFType" = 100
    SQL_ARROW_BATCHED_UDF: "SQLArrowBatchedUDFType" = 101

    SQL_SCALAR_PANDAS_UDF: "PandasScalarUDFType" = 200
    SQL_GROUPED_MAP_PANDAS_UDF: "PandasGroupedMapUDFType" = 201
    SQL_GROUPED_AGG_PANDAS_UDF: "PandasGroupedAggUDFType" = 202
    SQL_WINDOW_AGG_PANDAS_UDF: "PandasWindowAggUDFType" = 203
    SQL_SCALAR_PANDAS_ITER_UDF: "PandasScalarIterUDFType" = 204
    SQL_MAP_PANDAS_ITER_UDF: "PandasMapIterUDFType" = 205
    SQL_COGROUPED_MAP_PANDAS_UDF: "PandasCogroupedMapUDFType" = 206
    SQL_MAP_ARROW_ITER_UDF: "ArrowMapIterUDFType" = 207
    SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE: "PandasGroupedMapUDFWithStateType" = 208

    SQL_TABLE_UDF: "SQLTableUDFType" = 300
    SQL_ARROW_TABLE_UDF: "SQLArrowTableUDFType" = 301


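# Illustrative note, not part of the original module: the codes above are grouped
# by numeric range (0 = no UDF, 1xx = SQL batched UDFs, 2xx = pandas UDFs,
# 3xx = table UDFs); PySpark ships one of these codes to the JVM with each
# wrapped Python function so the Scala side can pick the matching
# (de)serialization path. The helper name below is hypothetical.
def _example_eval_type_family(eval_type: int) -> int:
    # e.g. SQL_SCALAR_PANDAS_UDF (200) -> family 2 (pandas UDFs)
    return eval_type // 100

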
def portable_hash(x: Hashable) -> int:
    """
    This function returns consistent hash code for builtin types, especially
    for None and tuple with None.

    The algorithm is similar to that one used by CPython 2.7

    Examples
    --------
    >>> portable_hash(None)
    0
    >>> portable_hash((None, 1)) & 0xffffffff
    219750521
    """

    if "PYTHONHASHSEED" not in os.environ:
        raise PySparkRuntimeError(
            error_class="PYTHON_HASH_SEED_NOT_SET",
            message_parameters={},
        )

    if x is None:
        return 0
    if isinstance(x, tuple):
        h = 0x345678
        for i in x:
            h ^= portable_hash(i)
            h *= 1000003
            h &= sys.maxsize
        h ^= len(x)
        if h == -1:
            h = -2
        return int(h)
    return hash(x)


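# Illustrative sketch, not part of the original module: a hypothetical helper
# showing how portable_hash feeds partition assignment. PYTHONHASHSEED must be
# set to the same value on the driver and every executor so that string hashes
# agree across processes; the default partition count here is an example value.
def _example_partition_for(key: Hashable, num_partitions: int = 4) -> int:
    # Equal keys hash equally, so they always land in the same partition.
    return portable_hash(key) % num_partitions

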
class BoundedFloat(float):
    """
    Bounded value is generated by approximate job, with confidence and low
    bound and high bound.

    Examples
    --------
    >>> BoundedFloat(100.0, 0.95, 95.0, 105.0)
    100.0
    """

    confidence: float
    low: float
    high: float

    def __new__(cls, mean: float, confidence: float, low: float, high: float) -> "BoundedFloat":
        obj = float.__new__(cls, mean)
        obj.confidence = confidence
        obj.low = low
        obj.high = high
        return obj


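# Illustrative sketch, not part of the original module: a BoundedFloat behaves
# like its mean in arithmetic while carrying the confidence interval reported
# by an approximate job (e.g. the value returned by RDD.meanApprox).
def _example_bounded_float() -> None:
    est = BoundedFloat(100.0, 0.95, 95.0, 105.0)  # mean, confidence, low, high
    assert est == 100.0  # compares as a plain float
    assert est + 1 == 101.0  # arithmetic uses the mean
    assert (est.confidence, est.low, est.high) == (0.95, 95.0, 105.0)  # metadata kept

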
def _create_local_socket(sock_info: "JavaArray") -> "io.BufferedRWPair":
    """
    Create a local socket that can be used to load deserialized data from the JVM

    Parameters
    ----------
    sock_info : tuple
        Tuple containing port number and authentication secret for a local socket.

    Returns
    -------
    sockfile file descriptor of the local socket
    """
    sockfile: "io.BufferedRWPair"
    sock: "socket.socket"
    port: int = sock_info[0]
    auth_secret: str = sock_info[1]
    sockfile, sock = local_connect_and_auth(port, auth_secret)
    # The RDD materialization time is unpredictable, if we set a timeout for socket reading
    # operation, it will very possibly fail. See SPARK-18281.
    sock.settimeout(None)
    return sockfile


def _load_from_socket(sock_info: "JavaArray", serializer: Serializer) -> Iterator[Any]:
    """
    Connect to a local socket described by sock_info and use the given serializer to yield data

    Parameters
    ----------
    sock_info : tuple
        Tuple containing port number and authentication secret for a local socket.
    serializer : :class:`Serializer`
        The PySpark serializer to use

    Returns
    -------
    result of :meth:`Serializer.load_stream`,
    usually a generator that yields deserialized data
    """
    sockfile = _create_local_socket(sock_info)
    # The socket will be automatically closed when garbage-collected.
    return serializer.load_stream(sockfile)


def _local_iterator_from_socket(sock_info: "JavaArray", serializer: Serializer) -> Iterator[Any]:
    class PyLocalIterable:
        """Create a synchronous local iterable over a socket"""

        def __init__(self, _sock_info: "JavaArray", _serializer: Serializer):
            port: int
            auth_secret: str
            jsocket_auth_server: "JavaObject"
            port, auth_secret, self.jsocket_auth_server = _sock_info
            self._sockfile = _create_local_socket((port, auth_secret))
            self._serializer = _serializer
            self._read_iter: Iterator[Any] = iter([])  # Initialize as empty iterator
            self._read_status = 1

        def __iter__(self) -> Iterator[Any]:
            while self._read_status == 1:
                # Request next partition data from Java
                write_int(1, self._sockfile)
                self._sockfile.flush()

                # If response is 1 then there is a partition to read, if 0 then fully consumed
                self._read_status = read_int(self._sockfile)
                if self._read_status == 1:

                    # Load the partition data as a stream and read each item
                    self._read_iter = self._serializer.load_stream(self._sockfile)
                    for item in self._read_iter:
                        yield item

                # An error occurred, join serving thread and raise any exceptions from the JVM
                elif self._read_status == -1:
                    self.jsocket_auth_server.getResult()

        def __del__(self) -> None:
            # If local iterator is not fully consumed,
            if self._read_status == 1:
                try:
                    # Finish consuming partition data stream
                    for _ in self._read_iter:
                        pass
                    # Tell Java to stop sending data and close connection
                    write_int(0, self._sockfile)
                    self._sockfile.flush()
                except Exception:
                    # Ignore any errors, socket is automatically closed when garbage-collected
                    pass

    return iter(PyLocalIterable(sock_info, serializer))


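# Context note (an assumption based on how this helper is used elsewhere in
# PySpark): _local_iterator_from_socket backs RDD.toLocalIterator, pulling one
# partition at a time on demand instead of collecting the whole RDD at once:
#
#     for x in sc.range(10).toLocalIterator():  # streams partition by partition
#         print(x)

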
class Partitioner:
    def __init__(self, numPartitions: int, partitionFunc: Callable[[Any], int]):
        self.numPartitions = numPartitions
        self.partitionFunc = partitionFunc

    def __eq__(self, other: Any) -> bool:
        return (
            isinstance(other, Partitioner)
            and self.numPartitions == other.numPartitions
            and self.partitionFunc == other.partitionFunc
        )

    def __call__(self, k: Any) -> int:
        return self.partitionFunc(k) % self.numPartitions


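# Illustrative sketch, not part of the original module: a Partitioner folds a
# key's hash into a partition index, and two partitioners compare equal when
# both the partition count and the hash function match, which is how Spark can
# detect already co-partitioned data and skip a redundant shuffle.
def _example_partitioner() -> None:
    p = Partitioner(4, portable_hash)
    assert 0 <= p(("user", 42)) < 4  # stable index in [0, numPartitions)
    assert p == Partitioner(4, portable_hash)  # same count and func -> equal

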
class RDD(Generic[T_co]):

    """
    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
    Represents an immutable, partitioned collection of elements that can be
    operated on in parallel.
    """

    def __init__(
        self,
        jrdd: "JavaObject",
        ctx: "SparkContext",
        jrdd_deserializer: Serializer = AutoBatchedSerializer(CPickleSerializer()),
    ):
        self._jrdd = jrdd
        self.is_cached = False
        self.is_checkpointed = False
        self.has_resource_profile = False
        self.ctx = ctx
        self._jrdd_deserializer = jrdd_deserializer
        self._id = jrdd.id()
        self.partitioner: Optional[Partitioner] = None

    def _pickled(self: "RDD[T]") -> "RDD[T]":
        return self._reserialize(AutoBatchedSerializer(CPickleSerializer()))

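    # Usage note (a sketch, not part of the original class): RDDs are normally
    # obtained from a SparkContext rather than by calling this constructor
    # directly, e.g.:
    #
    #     rdd = sc.parallelize([1, 2, 3])  # or sc.range(n), sc.textFile(path)
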
    def id(self) -> int:
        """
        A unique ID for this RDD (within its SparkContext).

        .. versionadded:: 0.7.0

        Returns
        -------
        int
            The unique ID for this :class:`RDD`

        Examples
        --------
        >>> rdd = sc.range(5)
        >>> rdd.id()  # doctest: +SKIP
        3
        """
        return self._id

    def __repr__(self) -> str:
        return self._jrdd.toString()

    def __getnewargs__(self) -> NoReturn:
        # This method is called when attempting to pickle an RDD, which is always an error:
        raise PySparkRuntimeError(
            error_class="RDD_TRANSFORM_ONLY_VALID_ON_DRIVER",
            message_parameters={},
        )

    @property
    def context(self) -> "SparkContext":
        """
        The :class:`SparkContext` that this RDD was created on.

        .. versionadded:: 0.7.0

        Returns
        -------
        :class:`SparkContext`
            The :class:`SparkContext` that this RDD was created on

        Examples
        --------
        >>> rdd = sc.range(5)
        >>> rdd.context
        <SparkContext ...>
        >>> rdd.context is sc
        True
        """
        return self.ctx

    def cache(self: "RDD[T]") -> "RDD[T]":
        """
        Persist this RDD with the default storage level (`MEMORY_ONLY`).

        .. versionadded:: 0.7.0

        Returns
        -------
        :class:`RDD`
            The same :class:`RDD` with storage level set to `MEMORY_ONLY`

        See Also
        --------
        :meth:`RDD.persist`
        :meth:`RDD.unpersist`
        :meth:`RDD.getStorageLevel`

        Examples
        --------
        >>> rdd = sc.range(5)
        >>> rdd2 = rdd.cache()
        >>> rdd2 is rdd
        True
        >>> str(rdd.getStorageLevel())
        'Memory Serialized 1x Replicated'
        >>> _ = rdd.unpersist()
        """
        self.is_cached = True
        self.persist(StorageLevel.MEMORY_ONLY)
        return self

|  | <div class="viewcode-block" id="RDD.persist"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.persist.html#pyspark.RDD.persist">[docs]</a>    <span class="k">def</span> <span class="nf">persist</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">storageLevel</span><span class="p">:</span> <span class="n">StorageLevel</span> <span class="o">=</span> <span class="n">StorageLevel</span><span class="o">.</span><span class="n">MEMORY_ONLY</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Set this RDD's storage level to persist its values across operations</span> | 
|  | <span class="sd">        after the first time it is computed. This can only be used to assign</span> | 
|  | <span class="sd">        a new storage level if the RDD does not have a storage level set yet.</span> | 
|  | <span class="sd">        If no storage level is specified defaults to (`MEMORY_ONLY`).</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        storageLevel : :class:`StorageLevel`, default `MEMORY_ONLY`</span> | 
|  | <span class="sd">            the target storage level</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            The same :class:`RDD` with storage level set to `storageLevel`.</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.cache`</span> | 
|  | <span class="sd">        :meth:`RDD.unpersist`</span> | 
|  | <span class="sd">        :meth:`RDD.getStorageLevel`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(["b", "a", "c"])</span> | 
|  | <span class="sd">        >>> rdd.persist().is_cached</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> str(rdd.getStorageLevel())</span> | 
|  | <span class="sd">        'Memory Serialized 1x Replicated'</span> | 
|  | <span class="sd">        >>> _ = rdd.unpersist()</span> | 
|  | <span class="sd">        >>> rdd.is_cached</span> | 
|  | <span class="sd">        False</span> | 
|  |  | 
|  | <span class="sd">        >>> from pyspark import StorageLevel</span> | 
|  | <span class="sd">        >>> rdd2 = sc.range(5)</span> | 
|  | <span class="sd">        >>> _ = rdd2.persist(StorageLevel.MEMORY_AND_DISK)</span> | 
|  | <span class="sd">        >>> rdd2.is_cached</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> str(rdd2.getStorageLevel())</span> | 
|  | <span class="sd">        'Disk Memory Serialized 1x Replicated'</span> | 
|  |  | 
|  | <span class="sd">        Can not override existing storage level</span> | 
|  |  | 
|  | <span class="sd">        >>> _ = rdd2.persist(StorageLevel.MEMORY_ONLY_2)</span> | 
|  | <span class="sd">        Traceback (most recent call last):</span> | 
|  | <span class="sd">            ...</span> | 
|  | <span class="sd">        py4j.protocol.Py4JJavaError: ...</span> | 
|  |  | 
|  | <span class="sd">        Assign another storage level after `unpersist`</span> | 
|  |  | 
|  | <span class="sd">        >>> _ = rdd2.unpersist()</span> | 
|  | <span class="sd">        >>> rdd2.is_cached</span> | 
|  | <span class="sd">        False</span> | 
|  | <span class="sd">        >>> _ = rdd2.persist(StorageLevel.MEMORY_ONLY_2)</span> | 
|  | <span class="sd">        >>> str(rdd2.getStorageLevel())</span> | 
|  | <span class="sd">        'Memory Serialized 2x Replicated'</span> | 
|  | <span class="sd">        >>> rdd2.is_cached</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> _ = rdd2.unpersist()</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">True</span> | 
|  | <span class="n">javaStorageLevel</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_getJavaStorageLevel</span><span class="p">(</span><span class="n">storageLevel</span><span class="p">)</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">persist</span><span class="p">(</span><span class="n">javaStorageLevel</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.unpersist"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.unpersist.html#pyspark.RDD.unpersist">[docs]</a>    <span class="k">def</span> <span class="nf">unpersist</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">blocking</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Mark the RDD as non-persistent, and remove all blocks for it from</span> | 
|  | <span class="sd">        memory and disk.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        blocking : bool, optional, default False</span> | 
|  | <span class="sd">            whether to block until all blocks are deleted</span> | 
|  |  | 
|  | <span class="sd">            .. versionadded:: 3.0.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            The same :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.cache`</span> | 
|  | <span class="sd">        :meth:`RDD.persist`</span> | 
|  | <span class="sd">        :meth:`RDD.getStorageLevel`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.range(5)</span> | 
|  | <span class="sd">        >>> rdd.is_cached</span> | 
|  | <span class="sd">        False</span> | 
|  | <span class="sd">        >>> _ = rdd.unpersist()</span> | 
|  | <span class="sd">        >>> rdd.is_cached</span> | 
|  | <span class="sd">        False</span> | 
|  | <span class="sd">        >>> _ = rdd.cache()</span> | 
|  | <span class="sd">        >>> rdd.is_cached</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> _ = rdd.unpersist()</span> | 
|  | <span class="sd">        >>> rdd.is_cached</span> | 
|  | <span class="sd">        False</span> | 
|  | <span class="sd">        >>> _ = rdd.unpersist()</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">unpersist</span><span class="p">(</span><span class="n">blocking</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.checkpoint"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.checkpoint.html#pyspark.RDD.checkpoint">[docs]</a>    <span class="k">def</span> <span class="nf">checkpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Mark this RDD for checkpointing. It will be saved to a file inside the</span> | 
|  | <span class="sd">        checkpoint directory set with :meth:`SparkContext.setCheckpointDir` and</span> | 
|  | <span class="sd">        all references to its parent RDDs will be removed. This function must</span> | 
|  | <span class="sd">        be called before any job has been executed on this RDD. It is strongly</span> | 
|  | <span class="sd">        recommended that this RDD is persisted in memory, otherwise saving it</span> | 
|  | <span class="sd">        on a file will require recomputation.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.isCheckpointed`</span> | 
|  | <span class="sd">        :meth:`RDD.getCheckpointFile`</span> | 
|  | <span class="sd">        :meth:`RDD.localCheckpoint`</span> | 
|  | <span class="sd">        :meth:`SparkContext.setCheckpointDir`</span> | 
|  | <span class="sd">        :meth:`SparkContext.getCheckpointDir`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.range(5)</span> | 
|  | <span class="sd">        >>> rdd.is_checkpointed</span> | 
|  | <span class="sd">        False</span> | 
|  | <span class="sd">        >>> rdd.getCheckpointFile() == None</span> | 
|  | <span class="sd">        True</span> | 
|  |  | 
|  | <span class="sd">        >>> rdd.checkpoint()</span> | 
|  | <span class="sd">        >>> rdd.is_checkpointed</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> rdd.getCheckpointFile() == None</span> | 
|  | <span class="sd">        True</span> | 
|  |  | 
|  | <span class="sd">        >>> rdd.count()</span> | 
|  | <span class="sd">        5</span> | 
|  | <span class="sd">        >>> rdd.is_checkpointed</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> rdd.getCheckpointFile() == None</span> | 
|  | <span class="sd">        False</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">is_checkpointed</span> <span class="o">=</span> <span class="kc">True</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">()</span><span class="o">.</span><span class="n">checkpoint</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.isCheckpointed"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.isCheckpointed.html#pyspark.RDD.isCheckpointed">[docs]</a>    <span class="k">def</span> <span class="nf">isCheckpointed</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return whether this RDD is checkpointed and materialized, either reliably or locally.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        bool</span> | 
|  | <span class="sd">            whether this :class:`RDD` is checkpointed and materialized, either reliably or locally</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.checkpoint`</span> | 
|  | <span class="sd">        :meth:`RDD.getCheckpointFile`</span> | 
|  | <span class="sd">        :meth:`SparkContext.setCheckpointDir`</span> | 
|  | <span class="sd">        :meth:`SparkContext.getCheckpointDir`</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">()</span><span class="o">.</span><span class="n">isCheckpointed</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.localCheckpoint"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.localCheckpoint.html#pyspark.RDD.localCheckpoint">[docs]</a>    <span class="k">def</span> <span class="nf">localCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Mark this RDD for local checkpointing using Spark's existing caching layer.</span> | 
|  |  | 
|  | <span class="sd">        This method is for users who wish to truncate RDD lineages while skipping the expensive</span> | 
|  | <span class="sd">        step of replicating the materialized data in a reliable distributed file system. This is</span> | 
|  | <span class="sd">        useful for RDDs with long lineages that need to be truncated periodically (e.g. GraphX).</span> | 
|  |  | 
|  | <span class="sd">        Local checkpointing sacrifices fault-tolerance for performance. In particular, checkpointed</span> | 
|  | <span class="sd">        data is written to ephemeral local storage in the executors instead of to a reliable,</span> | 
|  | <span class="sd">        fault-tolerant storage. The effect is that if an executor fails during the computation,</span> | 
|  | <span class="sd">        the checkpointed data may no longer be accessible, causing an irrecoverable job failure.</span> | 
|  |  | 
|  | <span class="sd">        This is NOT safe to use with dynamic allocation, which removes executors along</span> | 
|  | <span class="sd">        with their cached blocks. If you must use both features, you are advised to set</span> | 
|  | <span class="sd">        `spark.dynamicAllocation.cachedExecutorIdleTimeout` to a high value.</span> | 
|  |  | 
|  | <span class="sd">        The checkpoint directory set through :meth:`SparkContext.setCheckpointDir` is not used.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 2.2.0</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.checkpoint`</span> | 
|  | <span class="sd">        :meth:`RDD.isLocallyCheckpointed`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.range(5)</span> | 
|  | <span class="sd">        >>> rdd.isLocallyCheckpointed()</span> | 
|  | <span class="sd">        False</span> | 
|  |  | 
|  | <span class="sd">        >>> rdd.localCheckpoint()</span> | 
|  | <span class="sd">        >>> rdd.isLocallyCheckpointed()</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">()</span><span class="o">.</span><span class="n">localCheckpoint</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.isLocallyCheckpointed"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.isLocallyCheckpointed.html#pyspark.RDD.isLocallyCheckpointed">[docs]</a>    <span class="k">def</span> <span class="nf">isLocallyCheckpointed</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return whether this RDD is marked for local checkpointing.</span> | 
|  |  | 
|  | <span class="sd">        Exposed for testing.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 2.2.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        bool</span> | 
|  | <span class="sd">            whether this :class:`RDD` is marked for local checkpointing</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.localCheckpoint`</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">()</span><span class="o">.</span><span class="n">isLocallyCheckpointed</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.getCheckpointFile"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.getCheckpointFile.html#pyspark.RDD.getCheckpointFile">[docs]</a>    <span class="k">def</span> <span class="nf">getCheckpointFile</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Gets the name of the file to which this RDD was checkpointed</span> | 
|  |  | 
|  | <span class="sd">        Not defined if RDD is checkpointed locally.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        str</span> | 
|  | <span class="sd">            the name of the file to which this :class:`RDD` was checkpointed</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.checkpoint`</span> | 
|  | <span class="sd">        :meth:`SparkContext.setCheckpointDir`</span> | 
|  | <span class="sd">        :meth:`SparkContext.getCheckpointDir`</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">checkpointFile</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">()</span><span class="o">.</span><span class="n">getCheckpointFile</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">checkpointFile</span><span class="o">.</span><span class="n">get</span><span class="p">()</span> <span class="k">if</span> <span class="n">checkpointFile</span><span class="o">.</span><span class="n">isDefined</span><span class="p">()</span> <span class="k">else</span> <span class="kc">None</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.cleanShuffleDependencies"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.cleanShuffleDependencies.html#pyspark.RDD.cleanShuffleDependencies">[docs]</a>    <span class="k">def</span> <span class="nf">cleanShuffleDependencies</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">blocking</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Removes an RDD's shuffles and it's non-persisted ancestors.</span> | 
|  |  | 
|  | <span class="sd">        When running without a shuffle service, cleaning up shuffle files enables downscaling.</span> | 
|  | <span class="sd">        If you use the RDD after this call, you should checkpoint and materialize it first.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 3.3.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        blocking : bool, optional, default False</span> | 
|  | <span class="sd">           whether to block on shuffle cleanup tasks</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This API is a developer API.</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">()</span><span class="o">.</span><span class="n">cleanShuffleDependencies</span><span class="p">(</span><span class="n">blocking</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.map"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.map.html#pyspark.RDD.map">[docs]</a>    <span class="k">def</span> <span class="nf">map</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[U]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD by applying a function to each element of this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to run on each element of the RDD</span> | 
|  | <span class="sd">        preservesPartitioning : bool, optional, default False</span> | 
|  | <span class="sd">            indicates whether the input function preserves the partitioner,</span> | 
|  | <span class="sd">            which should be False unless this is a pair RDD and the input</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.flatMap`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitions`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithIndex`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithSplit`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(["b", "a", "c"])</span> | 
|  | <span class="sd">        >>> sorted(rdd.map(lambda x: (x, 1)).collect())</span> | 
|  | <span class="sd">        [('a', 1), ('b', 1), ('c', 1)]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">_</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">f</span><span class="p">),</span> <span class="n">iterator</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.flatMap"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.flatMap.html#pyspark.RDD.flatMap">[docs]</a>    <span class="k">def</span> <span class="nf">flatMap</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]],</span> <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[U]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD by first applying a function to all elements of this</span> | 
|  | <span class="sd">        RDD, and then flattening the results.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to turn a T into a sequence of U</span> | 
|  | <span class="sd">        preservesPartitioning : bool, optional, default False</span> | 
|  | <span class="sd">            indicates whether the input function preserves the partitioner,</span> | 
|  | <span class="sd">            which should be False unless this is a pair RDD and the input</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.map`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitions`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithIndex`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithSplit`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([2, 3, 4])</span> | 
|  | <span class="sd">        >>> sorted(rdd.flatMap(lambda x: range(1, x)).collect())</span> | 
|  | <span class="sd">        [1, 1, 1, 2, 2, 3]</span> | 
|  | <span class="sd">        >>> sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect())</span> | 
|  | <span class="sd">        [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">_</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">chain</span><span class="o">.</span><span class="n">from_iterable</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">f</span><span class="p">),</span> <span class="n">iterator</span><span class="p">))</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.mapPartitions"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.mapPartitions.html#pyspark.RDD.mapPartitions">[docs]</a>    <span class="k">def</span> <span class="nf">mapPartitions</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]],</span> <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[U]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD by applying a function to each partition of this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to run on each partition of the RDD</span> | 
|  | <span class="sd">        preservesPartitioning : bool, optional, default False</span> | 
|  | <span class="sd">            indicates whether the input function preserves the partitioner,</span> | 
|  | <span class="sd">            which should be False unless this is a pair RDD and the input</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to each partition</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.map`</span> | 
|  | <span class="sd">        :meth:`RDD.flatMap`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithIndex`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithSplit`</span> | 
|  | <span class="sd">        :meth:`RDDBarrier.mapPartitions`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4], 2)</span> | 
|  | <span class="sd">        >>> def f(iterator): yield sum(iterator)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> rdd.mapPartitions(f).collect()</span> | 
|  | <span class="sd">        [3, 7]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">_</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.mapPartitionsWithIndex"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.mapPartitionsWithIndex.html#pyspark.RDD.mapPartitionsWithIndex">[docs]</a>    <span class="k">def</span> <span class="nf">mapPartitionsWithIndex</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> | 
|  | <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]],</span> | 
|  | <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[U]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD by applying a function to each partition of this RDD,</span> | 
|  | <span class="sd">        while tracking the index of the original partition.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to run on each partition of the RDD</span> | 
|  | <span class="sd">        preservesPartitioning : bool, optional, default False</span> | 
|  | <span class="sd">            indicates whether the input function preserves the partitioner,</span> | 
|  | <span class="sd">            which should be False unless this is a pair RDD and the input</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to each partition</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.map`</span> | 
|  | <span class="sd">        :meth:`RDD.flatMap`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitions`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithSplit`</span> | 
|  | <span class="sd">        :meth:`RDDBarrier.mapPartitionsWithIndex`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4], 4)</span> | 
|  | <span class="sd">        >>> def f(splitIndex, iterator): yield splitIndex</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> rdd.mapPartitionsWithIndex(f).sum()</span> | 
|  | <span class="sd">        6</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">PipelinedRDD</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.mapPartitionsWithSplit"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.mapPartitionsWithSplit.html#pyspark.RDD.mapPartitionsWithSplit">[docs]</a>    <span class="k">def</span> <span class="nf">mapPartitionsWithSplit</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> | 
|  | <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]],</span> | 
|  | <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[U]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD by applying a function to each partition of this RDD,</span> | 
|  | <span class="sd">        while tracking the index of the original partition.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        .. deprecated:: 0.9.0</span> | 
|  | <span class="sd">            use meth:`RDD.mapPartitionsWithIndex` instead.</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to run on each partition of the RDD</span> | 
|  | <span class="sd">        preservesPartitioning : bool, optional, default False</span> | 
|  | <span class="sd">            indicates whether the input function preserves the partitioner,</span> | 
|  | <span class="sd">            which should be False unless this is a pair RDD and the input</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to each partition</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.map`</span> | 
|  | <span class="sd">        :meth:`RDD.flatMap`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitions`</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithIndex`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4], 4)</span> | 
|  | <span class="sd">        >>> def f(splitIndex, iterator): yield splitIndex</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> rdd.mapPartitionsWithSplit(f).sum()</span> | 
|  | <span class="sd">        6</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> | 
|  | <span class="s2">"mapPartitionsWithSplit is deprecated; use mapPartitionsWithIndex instead"</span><span class="p">,</span> | 
|  | <span class="ne">FutureWarning</span><span class="p">,</span> | 
|  | <span class="n">stacklevel</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> | 
|  | <span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.getNumPartitions"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.getNumPartitions.html#pyspark.RDD.getNumPartitions">[docs]</a>    <span class="k">def</span> <span class="nf">getNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Returns the number of partitions in RDD</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        int</span> | 
|  | <span class="sd">            number of partitions</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4], 2)</span> | 
|  | <span class="sd">        >>> rdd.getNumPartitions()</span> | 
|  | <span class="sd">        2</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">partitions</span><span class="p">()</span><span class="o">.</span><span class="n">size</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.filter"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.filter.html#pyspark.RDD.filter">[docs]</a>    <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="nb">bool</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD containing only the elements that satisfy a predicate.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to run on each element of the RDD</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to each element</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.map`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4, 5])</span> | 
|  | <span class="sd">        >>> rdd.filter(lambda x: x % 2 == 0).collect()</span> | 
|  | <span class="sd">        [2, 4]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="nb">filter</span><span class="p">(</span><span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">f</span><span class="p">),</span> <span class="n">iterator</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.distinct"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.distinct.html#pyspark.RDD.distinct">[docs]</a>    <span class="k">def</span> <span class="nf">distinct</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD containing the distinct elements in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` containing the distinct elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.countApproxDistinct`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())</span> | 
|  | <span class="sd">        [1, 2, 3]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="kc">None</span><span class="p">))</span> | 
|  | <span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">_</span><span class="p">:</span> <span class="n">x</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span> | 
|  | <span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sample"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sample.html#pyspark.RDD.sample">[docs]</a>    <span class="k">def</span> <span class="nf">sample</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">withReplacement</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">fraction</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a sampled subset of this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        withReplacement : bool</span> | 
|  | <span class="sd">            can elements be sampled multiple times (replaced when sampled out)</span> | 
|  | <span class="sd">        fraction : float</span> | 
|  | <span class="sd">            expected size of the sample as a fraction of this RDD's size</span> | 
|  | <span class="sd">            without replacement: probability that each element is chosen; fraction must be [0, 1]</span> | 
|  | <span class="sd">            with replacement: expected number of times each element is chosen; fraction must be >= 0</span> | 
|  | <span class="sd">        seed : int, optional</span> | 
|  | <span class="sd">            seed for the random number generator</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` containing a sampled subset of elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.takeSample`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleByKey`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.sample`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This is not guaranteed to provide exactly the fraction specified of the total</span> | 
|  | <span class="sd">        count of the given :class:`DataFrame`.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(100), 4)</span> | 
|  | <span class="sd">        >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="ow">not</span> <span class="n">fraction</span> <span class="o">>=</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Fraction must be nonnegative."</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">RDDSampler</span><span class="p">(</span><span class="n">withReplacement</span><span class="p">,</span> <span class="n">fraction</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span><span class="o">.</span><span class="n">func</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.randomSplit"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.randomSplit.html#pyspark.RDD.randomSplit">[docs]</a>    <span class="k">def</span> <span class="nf">randomSplit</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">weights</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">]],</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"List[RDD[T]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Randomly splits this RDD with the provided weights.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.3.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        weights : list</span> | 
|  | <span class="sd">            weights for splits, will be normalized if they don't sum to 1</span> | 
|  | <span class="sd">        seed : int, optional</span> | 
|  | <span class="sd">            random seed</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            split :class:`RDD`\\s in a list</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.randomSplit`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(500), 1)</span> | 
|  | <span class="sd">        >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 17)</span> | 
|  | <span class="sd">        >>> len(rdd1.collect() + rdd2.collect())</span> | 
|  | <span class="sd">        500</span> | 
|  | <span class="sd">        >>> 150 < rdd1.count() < 250</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> 250 < rdd2.count() < 350</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">w</span> <span class="o">>=</span> <span class="mi">0</span> <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">weights</span><span class="p">):</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Weights must be nonnegative"</span><span class="p">)</span> | 
|  | <span class="n">s</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="n">weights</span><span class="p">))</span> | 
|  | <span class="k">if</span> <span class="ow">not</span> <span class="n">s</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Sum of weights must be positive"</span><span class="p">)</span> | 
|  | <span class="n">cweights</span> <span class="o">=</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]</span> | 
|  | <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">weights</span><span class="p">:</span> | 
|  | <span class="n">cweights</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cweights</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="n">w</span> <span class="o">/</span> <span class="n">s</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="n">seed</span> <span class="o">=</span> <span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="o">**</span><span class="mi">32</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="p">[</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">RDDRangeSampler</span><span class="p">(</span><span class="n">lb</span><span class="p">,</span> <span class="n">ub</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span><span class="o">.</span><span class="n">func</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> | 
|  | <span class="k">for</span> <span class="n">lb</span><span class="p">,</span> <span class="n">ub</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">cweights</span><span class="p">,</span> <span class="n">cweights</span><span class="p">[</span><span class="mi">1</span><span class="p">:])</span> | 
|  | <span class="p">]</span></div> | 
|  |  | 
|  | <span class="c1"># this is ported from scala/spark/RDD.scala</span> | 
|  | <div class="viewcode-block" id="RDD.takeSample"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.takeSample.html#pyspark.RDD.takeSample">[docs]</a>    <span class="k">def</span> <span class="nf">takeSample</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">withReplacement</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a fixed-size sampled subset of this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.3.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        withReplacement : bool</span> | 
|  | <span class="sd">            whether sampling is done with replacement</span> | 
|  | <span class="sd">        num : int</span> | 
|  | <span class="sd">            size of the returned sample</span> | 
|  | <span class="sd">        seed : int, optional</span> | 
|  | <span class="sd">            random seed</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            a fixed-size sampled subset of this :class:`RDD` in an array</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.sample`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This method should only be used if the resulting array is expected</span> | 
|  | <span class="sd">        to be small, as all the data is loaded into the driver's memory.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import sys</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(0, 10))</span> | 
|  | <span class="sd">        >>> len(rdd.takeSample(True, 20, 1))</span> | 
|  | <span class="sd">        20</span> | 
|  | <span class="sd">        >>> len(rdd.takeSample(False, 5, 2))</span> | 
|  | <span class="sd">        5</span> | 
|  | <span class="sd">        >>> len(rdd.takeSample(False, 15, 3))</span> | 
|  | <span class="sd">        10</span> | 
|  | <span class="sd">        >>> sc.range(0, 10).takeSample(False, sys.maxsize)</span> | 
|  | <span class="sd">        Traceback (most recent call last):</span> | 
|  | <span class="sd">            ...</span> | 
|  | <span class="sd">        ValueError: Sample size cannot be greater than ...</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">numStDev</span> <span class="o">=</span> <span class="mf">10.0</span> | 
|  | <span class="n">maxSampleSize</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span> <span class="o">-</span> <span class="nb">int</span><span class="p">(</span><span class="n">numStDev</span> <span class="o">*</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">))</span> | 
|  | <span class="k">if</span> <span class="n">num</span> <span class="o"><</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Sample size cannot be negative."</span><span class="p">)</span> | 
|  | <span class="k">elif</span> <span class="n">num</span> <span class="o">></span> <span class="n">maxSampleSize</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Sample size cannot be greater than </span><span class="si">%d</span><span class="s2">."</span> <span class="o">%</span> <span class="n">maxSampleSize</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="n">num</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="p">[]</span> | 
|  |  | 
|  | <span class="n">initialCount</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">count</span><span class="p">()</span> | 
|  | <span class="k">if</span> <span class="n">initialCount</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="p">[]</span> | 
|  |  | 
|  | <span class="n">rand</span> <span class="o">=</span> <span class="n">random</span><span class="o">.</span><span class="n">Random</span><span class="p">(</span><span class="n">seed</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="p">(</span><span class="ow">not</span> <span class="n">withReplacement</span><span class="p">)</span> <span class="ow">and</span> <span class="n">num</span> <span class="o">>=</span> <span class="n">initialCount</span><span class="p">:</span> | 
|  | <span class="c1"># shuffle current RDD and return</span> | 
|  | <span class="n">samples</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  | <span class="n">rand</span><span class="o">.</span><span class="n">shuffle</span><span class="p">(</span><span class="n">samples</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">samples</span> | 
|  |  | 
|  | <span class="n">fraction</span> <span class="o">=</span> <span class="n">RDD</span><span class="o">.</span><span class="n">_computeFractionForSampleSize</span><span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">initialCount</span><span class="p">,</span> <span class="n">withReplacement</span><span class="p">)</span> | 
|  | <span class="n">samples</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="n">withReplacement</span><span class="p">,</span> <span class="n">fraction</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  |  | 
|  | <span class="c1"># If the first sample didn't turn out large enough, keep trying to take samples;</span> | 
|  | <span class="c1"># this shouldn't happen often because we use a big multiplier for their initial size.</span> | 
|  | <span class="c1"># See: scala/spark/RDD.scala</span> | 
|  | <span class="k">while</span> <span class="nb">len</span><span class="p">(</span><span class="n">samples</span><span class="p">)</span> <span class="o"><</span> <span class="n">num</span><span class="p">:</span> | 
|  | <span class="c1"># TODO: add log warning for when more than one iteration was run</span> | 
|  | <span class="n">seed</span> <span class="o">=</span> <span class="n">rand</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">)</span> | 
|  | <span class="n">samples</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="n">withReplacement</span><span class="p">,</span> <span class="n">fraction</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  |  | 
|  | <span class="n">rand</span><span class="o">.</span><span class="n">shuffle</span><span class="p">(</span><span class="n">samples</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">samples</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">num</span><span class="p">]</span></div> | 
|  |  | 
|  | <span class="nd">@staticmethod</span> | 
|  | <span class="k">def</span> <span class="nf">_computeFractionForSampleSize</span><span class="p">(</span> | 
|  | <span class="n">sampleSizeLowerBound</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">total</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">withReplacement</span><span class="p">:</span> <span class="nb">bool</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Returns a sampling rate that guarantees a sample of</span> | 
|  | <span class="sd">        size >= sampleSizeLowerBound 99.99% of the time.</span> | 
|  |  | 
|  | <span class="sd">        How the sampling rate is determined:</span> | 
|  | <span class="sd">        Let p = num / total, where num is the sample size and total is the</span> | 
|  | <span class="sd">        total number of data points in the RDD. We're trying to compute</span> | 
|  | <span class="sd">        q > p such that</span> | 
|  | <span class="sd">          - when sampling with replacement, we're drawing each data point</span> | 
|  | <span class="sd">            with prob_i ~ Pois(q), where we want to guarantee</span> | 
|  | <span class="sd">            Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to</span> | 
|  | <span class="sd">            total), i.e. the failure rate of not having a sufficiently large</span> | 
|  | <span class="sd">            sample < 0.0001. Setting q = p + 5 * sqrt(p/total) is sufficient</span> | 
|  | <span class="sd">            to guarantee 0.9999 success rate for num > 12, but we need a</span> | 
|  | <span class="sd">            slightly larger q (9 empirically determined).</span> | 
|  | <span class="sd">          - when sampling without replacement, we're drawing each data point</span> | 
|  | <span class="sd">            with prob_i ~ Binomial(total, fraction) and our choice of q</span> | 
|  | <span class="sd">            guarantees 1-delta, or 0.9999 success rate, where success rate is</span> | 
|  | <span class="sd">            defined the same as in sampling with replacement.</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">fraction</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">sampleSizeLowerBound</span><span class="p">)</span> <span class="o">/</span> <span class="n">total</span> | 
|  | <span class="k">if</span> <span class="n">withReplacement</span><span class="p">:</span> | 
|  | <span class="n">numStDev</span> <span class="o">=</span> <span class="mi">5</span> | 
|  | <span class="k">if</span> <span class="n">sampleSizeLowerBound</span> <span class="o"><</span> <span class="mi">12</span><span class="p">:</span> | 
|  | <span class="n">numStDev</span> <span class="o">=</span> <span class="mi">9</span> | 
|  | <span class="k">return</span> <span class="n">fraction</span> <span class="o">+</span> <span class="n">numStDev</span> <span class="o">*</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">fraction</span> <span class="o">/</span> <span class="n">total</span><span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="n">delta</span> <span class="o">=</span> <span class="mf">0.00005</span> | 
|  | <span class="n">gamma</span> <span class="o">=</span> <span class="o">-</span><span class="n">log</span><span class="p">(</span><span class="n">delta</span><span class="p">)</span> <span class="o">/</span> <span class="n">total</span> | 
|  | <span class="k">return</span> <span class="nb">min</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">fraction</span> <span class="o">+</span> <span class="n">gamma</span> <span class="o">+</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">gamma</span> <span class="o">*</span> <span class="n">gamma</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">gamma</span> <span class="o">*</span> <span class="n">fraction</span><span class="p">))</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.union"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.union.html#pyspark.RDD.union">[docs]</a>    <span class="k">def</span> <span class="nf">union</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[U]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Union[T, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the union of this RDD and another one.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            the union of this :class:`RDD` and another one</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.union`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.union`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 1, 2, 3])</span> | 
|  | <span class="sd">        >>> rdd.union(rdd).collect()</span> | 
|  | <span class="sd">        [1, 1, 2, 3, 1, 1, 2, 3]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">:</span> | 
|  | <span class="n">rdd</span><span class="p">:</span> <span class="s2">"RDD[Union[T, U]]"</span> <span class="o">=</span> <span class="n">RDD</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> | 
|  | <span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="c1"># These RDDs contain data in different serialized formats, so we</span> | 
|  | <span class="c1"># must normalize them to the default serializer.</span> | 
|  | <span class="n">self_copy</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reserialize</span><span class="p">()</span> | 
|  | <span class="n">other_copy</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">_reserialize</span><span class="p">()</span> | 
|  | <span class="n">rdd</span> <span class="o">=</span> <span class="n">RDD</span><span class="p">(</span><span class="n">self_copy</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">other_copy</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">serializer</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">partitioner</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">partitioner</span> | 
|  | <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">==</span> <span class="n">rdd</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> | 
|  | <span class="p">):</span> | 
|  | <span class="n">rdd</span><span class="o">.</span><span class="n">partitioner</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">partitioner</span> | 
|  | <span class="k">return</span> <span class="n">rdd</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.intersection"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.intersection.html#pyspark.RDD.intersection">[docs]</a>    <span class="k">def</span> <span class="nf">intersection</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the intersection of this RDD and another one. The output will</span> | 
|  | <span class="sd">        not contain any duplicate elements, even if the input RDDs did.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            the intersection of this :class:`RDD` and another one</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.intersect`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This method performs a shuffle internally.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])</span> | 
|  | <span class="sd">        >>> rdd1.intersection(rdd2).collect()</span> | 
|  | <span class="sd">        [1, 2, 3]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">v</span><span class="p">:</span> <span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="kc">None</span><span class="p">))</span> | 
|  | <span class="o">.</span><span class="n">cogroup</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">v</span><span class="p">:</span> <span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="kc">None</span><span class="p">)))</span> | 
|  | <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">k_vs</span><span class="p">:</span> <span class="nb">all</span><span class="p">(</span><span class="n">k_vs</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span> | 
|  | <span class="o">.</span><span class="n">keys</span><span class="p">()</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_reserialize</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">serializer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Serializer</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="n">serializer</span> <span class="o">=</span> <span class="n">serializer</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">serializer</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> <span class="o">!=</span> <span class="n">serializer</span><span class="p">:</span> | 
|  | <span class="bp">self</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> <span class="o">=</span> <span class="n">serializer</span> | 
|  | <span class="k">return</span> <span class="bp">self</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="fm">__add__</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[U]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Union[T, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the union of this RDD and another one.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 1, 2, 3])</span> | 
|  | <span class="sd">        >>> (rdd + rdd).collect()</span> | 
|  | <span class="sd">        [1, 1, 2, 3, 1, 1, 2, 3]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> | 
|  | <span class="k">raise</span> <span class="ne">TypeError</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">repartitionAndSortWithinPartitions</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[S, V]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"S"</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[S, V]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">repartitionAndSortWithinPartitions</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">],</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> | 
|  | <span class="n">keyfunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">],</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">repartitionAndSortWithinPartitions</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="o">*</span><span class="p">,</span> | 
|  | <span class="n">keyfunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">],</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.repartitionAndSortWithinPartitions"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.repartitionAndSortWithinPartitions.html#pyspark.RDD.repartitionAndSortWithinPartitions">[docs]</a>    <span class="k">def</span> <span class="nf">repartitionAndSortWithinPartitions</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[Any, Any]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> | 
|  | <span class="n">keyfunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[Any, Any]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Repartition the RDD according to the given partitioner and, within each resulting partition,</span> | 
|  | <span class="sd">        sort records by their keys.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            a function to compute the partition index</span> | 
|  | <span class="sd">        ascending : bool, optional, default True</span> | 
|  | <span class="sd">            sort the keys in ascending or descending order</span> | 
|  | <span class="sd">        keyfunc : function, optional, default identity mapping</span> | 
|  | <span class="sd">            a function to compute the key</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.repartition`</span> | 
|  | <span class="sd">        :meth:`RDD.partitionBy`</span> | 
|  | <span class="sd">        :meth:`RDD.sortBy`</span> | 
|  | <span class="sd">        :meth:`RDD.sortByKey`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)])</span> | 
|  | <span class="sd">        >>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, True)</span> | 
|  | <span class="sd">        >>> rdd2.glom().collect()</span> | 
|  | <span class="sd">        [[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">numPartitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="n">numPartitions</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_defaultReducePartitions</span><span class="p">()</span> | 
|  |  | 
|  | <span class="n">memory</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_memory_limit</span><span class="p">()</span> | 
|  | <span class="n">serializer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">sortPartition</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]]:</span> | 
|  | <span class="n">sort</span> <span class="o">=</span> <span class="n">ExternalSorter</span><span class="p">(</span><span class="n">memory</span> <span class="o">*</span> <span class="mf">0.9</span><span class="p">,</span> <span class="n">serializer</span><span class="p">)</span><span class="o">.</span><span class="n">sorted</span> | 
|  | <span class="k">return</span> <span class="nb">iter</span><span class="p">(</span><span class="n">sort</span><span class="p">(</span><span class="n">iterator</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">k_v</span><span class="p">:</span> <span class="n">keyfunc</span><span class="p">(</span><span class="n">k_v</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="n">reverse</span><span class="o">=</span><span class="p">(</span><span class="ow">not</span> <span class="n">ascending</span><span class="p">)))</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span><span class="p">)</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">sortPartition</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">sortByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[S, V]]"</span><span class="p">,</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">sortByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> | 
|  | <span class="n">keyfunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">],</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">sortByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> | 
|  | <span class="o">*</span><span class="p">,</span> | 
|  | <span class="n">keyfunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">],</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sortByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sortByKey.html#pyspark.RDD.sortByKey">[docs]</a>    <span class="k">def</span> <span class="nf">sortByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">keyfunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Sorts this RDD, which is assumed to consist of (key, value) pairs.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        ascending : bool, optional, default True</span> | 
|  | <span class="sd">            sort the keys in ascending or descending order</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        keyfunc : function, optional, default identity mapping</span> | 
|  | <span class="sd">            a function to compute the key</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.sortBy`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.sort`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]</span> | 
|  | <span class="sd">        >>> sc.parallelize(tmp).sortByKey().first()</span> | 
|  | <span class="sd">        ('1', 3)</span> | 
|  | <span class="sd">        >>> sc.parallelize(tmp).sortByKey(True, 1).collect()</span> | 
|  | <span class="sd">        [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]</span> | 
|  | <span class="sd">        >>> sc.parallelize(tmp).sortByKey(True, 2).collect()</span> | 
|  | <span class="sd">        [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]</span> | 
|  | <span class="sd">        >>> tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]</span> | 
|  | <span class="sd">        >>> tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])</span> | 
|  | <span class="sd">        >>> sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect()</span> | 
|  | <span class="sd">        [('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5),...('white', 9), ('whose', 6)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">numPartitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="n">numPartitions</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_defaultReducePartitions</span><span class="p">()</span> | 
|  |  | 
|  | <span class="n">memory</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_memory_limit</span><span class="p">()</span> | 
|  | <span class="n">serializer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">sortPartition</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]]:</span> | 
|  | <span class="n">sort</span> <span class="o">=</span> <span class="n">ExternalSorter</span><span class="p">(</span><span class="n">memory</span> <span class="o">*</span> <span class="mf">0.9</span><span class="p">,</span> <span class="n">serializer</span><span class="p">)</span><span class="o">.</span><span class="n">sorted</span> | 
|  | <span class="k">return</span> <span class="nb">iter</span><span class="p">(</span><span class="n">sort</span><span class="p">(</span><span class="n">iterator</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">kv</span><span class="p">:</span> <span class="n">keyfunc</span><span class="p">(</span><span class="n">kv</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="n">reverse</span><span class="o">=</span><span class="p">(</span><span class="ow">not</span> <span class="n">ascending</span><span class="p">)))</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="n">numPartitions</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="bp">self</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">sortPartition</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> | 
|  |  | 
|  | <span class="c1"># first compute the boundary of each part via sampling: we want to partition</span> | 
|  | <span class="c1"># the key-space into bins such that the bins have roughly the same</span> | 
|  | <span class="c1"># number of (key, value) pairs falling into them</span> | 
|  | <span class="n">rddSize</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">count</span><span class="p">()</span> | 
|  | <span class="k">if</span> <span class="ow">not</span> <span class="n">rddSize</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span>  <span class="c1"># empty RDD</span> | 
|  | <span class="n">maxSampleSize</span> <span class="o">=</span> <span class="n">numPartitions</span> <span class="o">*</span> <span class="mf">20.0</span>  <span class="c1"># constant from Spark's RangePartitioner</span> | 
|  | <span class="n">fraction</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">maxSampleSize</span> <span class="o">/</span> <span class="nb">max</span><span class="p">(</span><span class="n">rddSize</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="mf">1.0</span><span class="p">)</span> | 
|  | <span class="n">samples</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="kc">False</span><span class="p">,</span> <span class="n">fraction</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">kv</span><span class="p">:</span> <span class="n">kv</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  | <span class="n">samples</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">samples</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">keyfunc</span><span class="p">)</span> | 
|  |  | 
|  | <span class="c1"># we have numPartitions many parts but one of the them has</span> | 
|  | <span class="c1"># an implicit boundary</span> | 
|  | <span class="n">bounds</span> <span class="o">=</span> <span class="p">[</span> | 
|  | <span class="n">samples</span><span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">samples</span><span class="p">)</span> <span class="o">*</span> <span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> <span class="o">/</span> <span class="n">numPartitions</span><span class="p">)]</span> | 
|  | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">numPartitions</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> | 
|  | <span class="p">]</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">rangePartitioner</span><span class="p">(</span><span class="n">k</span><span class="p">:</span> <span class="n">K</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="n">p</span> <span class="o">=</span> <span class="n">bisect</span><span class="o">.</span><span class="n">bisect_left</span><span class="p">(</span><span class="n">bounds</span><span class="p">,</span> <span class="n">keyfunc</span><span class="p">(</span><span class="n">k</span><span class="p">))</span> | 
|  | <span class="k">if</span> <span class="n">ascending</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">p</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">numPartitions</span> <span class="o">-</span> <span class="mi">1</span> <span class="o">-</span> <span class="n">p</span>  <span class="c1"># type: ignore[operator]</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">rangePartitioner</span><span class="p">)</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">sortPartition</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sortBy"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sortBy.html#pyspark.RDD.sortBy">[docs]</a>    <span class="k">def</span> <span class="nf">sortBy</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> | 
|  | <span class="n">keyfunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">],</span> | 
|  | <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Sorts this RDD by the given keyfunc</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        keyfunc : function</span> | 
|  | <span class="sd">            a function to compute the key</span> | 
|  | <span class="sd">        ascending : bool, optional, default True</span> | 
|  | <span class="sd">            sort the keys in ascending or descending order</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.sortByKey`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.sort`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]</span> | 
|  | <span class="sd">        >>> sc.parallelize(tmp).sortBy(lambda x: x[0]).collect()</span> | 
|  | <span class="sd">        [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]</span> | 
|  | <span class="sd">        >>> sc.parallelize(tmp).sortBy(lambda x: x[1]).collect()</span> | 
|  | <span class="sd">        [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">keyBy</span><span class="p">(</span><span class="n">keyfunc</span><span class="p">)</span>  <span class="c1"># type: ignore[type-var]</span> | 
|  | <span class="o">.</span><span class="n">sortByKey</span><span class="p">(</span><span class="n">ascending</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span> | 
|  | <span class="o">.</span><span class="n">values</span><span class="p">()</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.glom"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.glom.html#pyspark.RDD.glom">[docs]</a>    <span class="k">def</span> <span class="nf">glom</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[List[T]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return an RDD created by coalescing all elements within each partition</span> | 
|  | <span class="sd">        into a list.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` coalescing all elements within each partition into a list</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4], 2)</span> | 
|  | <span class="sd">        >>> sorted(rdd.glom().collect())</span> | 
|  | <span class="sd">        [[1, 2], [3, 4]]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]]:</span> | 
|  | <span class="k">yield</span> <span class="nb">list</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">func</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.cartesian"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.cartesian.html#pyspark.RDD.cartesian">[docs]</a>    <span class="k">def</span> <span class="nf">cartesian</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[U]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[T, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the Cartesian product of this RDD and another one, that is, the</span> | 
|  | <span class="sd">        RDD of all pairs of elements ``(a, b)`` where ``a`` is in `self` and</span> | 
|  | <span class="sd">        ``b`` is in `other`.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            the Cartesian product of this :class:`RDD` and another one</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.crossJoin`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2])</span> | 
|  | <span class="sd">        >>> sorted(rdd.cartesian(rdd).collect())</span> | 
|  | <span class="sd">        [(1, 1), (1, 2), (2, 1), (2, 2)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="c1"># Due to batching, we can't use the Java cartesian method.</span> | 
|  | <span class="n">deserializer</span> <span class="o">=</span> <span class="n">CartesianDeserializer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">,</span> <span class="n">other</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">RDD</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">cartesian</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">,</span> <span class="n">deserializer</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.groupBy"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.groupBy.html#pyspark.RDD.groupBy">[docs]</a>    <span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> | 
|  | <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="n">K</span><span class="p">],</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Iterable[T]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return an RDD of grouped items.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to compute the key</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            a function to compute the partition index</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` of grouped items</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.groupByKey`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.groupBy`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 1, 2, 3, 5, 8])</span> | 
|  | <span class="sd">        >>> result = rdd.groupBy(lambda x: x % 2).collect()</span> | 
|  | <span class="sd">        >>> sorted([(x, sorted(y)) for (x, y) in result])</span> | 
|  | <span class="sd">        [(0, [2, 8]), (1, [1, 1, 3, 5])]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">f</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">x</span><span class="p">))</span><span class="o">.</span><span class="n">groupByKey</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.pipe"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.pipe.html#pyspark.RDD.pipe">[docs]</a>    <span class="k">def</span> <span class="nf">pipe</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">,</span> <span class="n">command</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">env</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">checkCode</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[str]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return an RDD created by piping elements to a forked external process.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        command : str</span> | 
|  | <span class="sd">            command to run.</span> | 
|  | <span class="sd">        env : dict, optional</span> | 
|  | <span class="sd">            environment variables to set.</span> | 
|  | <span class="sd">        checkCode : bool, optional</span> | 
|  | <span class="sd">            whether to check the return value of the shell command.</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` of strings</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()</span> | 
|  | <span class="sd">        ['1', '2', '', '3']</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">env</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="n">env</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> | 
|  | <span class="n">pipe</span> <span class="o">=</span> <span class="n">Popen</span><span class="p">(</span><span class="n">shlex</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">command</span><span class="p">),</span> <span class="n">env</span><span class="o">=</span><span class="n">env</span><span class="p">,</span> <span class="n">stdin</span><span class="o">=</span><span class="n">PIPE</span><span class="p">,</span> <span class="n">stdout</span><span class="o">=</span><span class="n">PIPE</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">pipe_objs</span><span class="p">(</span><span class="n">out</span><span class="p">:</span> <span class="n">IO</span><span class="p">[</span><span class="nb">bytes</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">s</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span><span class="o">.</span><span class="n">rstrip</span><span class="p">(</span><span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="p">)</span> <span class="o">+</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span> | 
|  | <span class="n">out</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">s</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">))</span> | 
|  | <span class="n">out</span><span class="o">.</span><span class="n">close</span><span class="p">()</span> | 
|  |  | 
|  | <span class="n">Thread</span><span class="p">(</span><span class="n">target</span><span class="o">=</span><span class="n">pipe_objs</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="p">[</span><span class="n">pipe</span><span class="o">.</span><span class="n">stdin</span><span class="p">])</span><span class="o">.</span><span class="n">start</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">check_return_code</span><span class="p">()</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> | 
|  | <span class="n">pipe</span><span class="o">.</span><span class="n">wait</span><span class="p">()</span> | 
|  | <span class="k">if</span> <span class="n">checkCode</span> <span class="ow">and</span> <span class="n">pipe</span><span class="o">.</span><span class="n">returncode</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> | 
|  | <span class="n">error_class</span><span class="o">=</span><span class="s2">"PIPE_FUNCTION_EXITED"</span><span class="p">,</span> | 
|  | <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span> | 
|  | <span class="s2">"func_name"</span><span class="p">:</span> <span class="n">command</span><span class="p">,</span> | 
|  | <span class="s2">"error_code"</span><span class="p">:</span> <span class="nb">str</span><span class="p">(</span><span class="n">pipe</span><span class="o">.</span><span class="n">returncode</span><span class="p">),</span> | 
|  | <span class="p">},</span> | 
|  | <span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">):</span> | 
|  | <span class="k">yield</span> <span class="n">i</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="p">(</span> | 
|  | <span class="n">cast</span><span class="p">(</span><span class="nb">bytes</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">rstrip</span><span class="p">(</span><span class="sa">b</span><span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="p">)</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">)</span> | 
|  | <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">chain</span><span class="p">(</span> | 
|  | <span class="nb">iter</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">IO</span><span class="p">[</span><span class="nb">bytes</span><span class="p">],</span> <span class="n">pipe</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span><span class="o">.</span><span class="n">readline</span><span class="p">,</span> <span class="sa">b</span><span class="s2">""</span><span class="p">),</span> <span class="n">check_return_code</span><span class="p">()</span> | 
|  | <span class="p">)</span> | 
|  | <span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">func</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.foreach"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.foreach.html#pyspark.RDD.foreach">[docs]</a>    <span class="k">def</span> <span class="nf">foreach</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="kc">None</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Applies a function to all elements of this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function applied to each element</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.foreachPartition`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.foreach`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.foreachPartition`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> def f(x): print(x)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3, 4, 5]).foreach(f)</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">f</span> <span class="o">=</span> <span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">processPartition</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">]:</span> | 
|  | <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">f</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="nb">iter</span><span class="p">([])</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">processPartition</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>  <span class="c1"># Force evaluation</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.foreachPartition"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.foreachPartition.html#pyspark.RDD.foreachPartition">[docs]</a>    <span class="k">def</span> <span class="nf">foreachPartition</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="kc">None</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Applies a function to each partition of this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function applied to each partition</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.foreach`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.foreach`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.foreachPartition`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> def f(iterator):</span> | 
|  | <span class="sd">        ...     for x in iterator:</span> | 
|  | <span class="sd">        ...          print(x)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">it</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">]:</span> | 
|  | <span class="n">r</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="n">it</span><span class="p">)</span> | 
|  | <span class="k">try</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="nb">iter</span><span class="p">(</span><span class="n">r</span><span class="p">)</span>  <span class="c1"># type: ignore[call-overload]</span> | 
|  | <span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="nb">iter</span><span class="p">([])</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>  <span class="c1"># Force evaluation</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.collect"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.collect.html#pyspark.RDD.collect">[docs]</a>    <span class="k">def</span> <span class="nf">collect</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a list that contains all the elements in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            a list containing all the elements</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This method should only be used if the resulting array is expected</span> | 
|  | <span class="sd">        to be small, as all the data is loaded into the driver's memory.</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.toLocalIterator`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.collect`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.range(5).collect()</span> | 
|  | <span class="sd">        [0, 1, 2, 3, 4]</span> | 
|  | <span class="sd">        >>> sc.parallelize(["x", "y", "z"]).collect()</span> | 
|  | <span class="sd">        ['x', 'y', 'z']</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">with</span> <span class="n">SCCallSiteSync</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">context</span><span class="p">):</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="n">sock_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">collectAndServe</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">())</span> | 
|  | <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="n">_load_from_socket</span><span class="p">(</span><span class="n">sock_info</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">))</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.collectWithJobGroup"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.collectWithJobGroup.html#pyspark.RDD.collectWithJobGroup">[docs]</a>    <span class="k">def</span> <span class="nf">collectWithJobGroup</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">groupId</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">description</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">interruptOnCancel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"List[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        When collect rdd, use this method to specify job group.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 3.0.0</span> | 
|  |  | 
|  | <span class="sd">        .. deprecated:: 3.1.0</span> | 
|  | <span class="sd">            Use :class:`pyspark.InheritableThread` with the pinned thread mode enabled.</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        groupId : str</span> | 
|  | <span class="sd">            The group ID to assign.</span> | 
|  | <span class="sd">        description : str</span> | 
|  | <span class="sd">            The description to set for the job group.</span> | 
|  | <span class="sd">        interruptOnCancel : bool, optional, default False</span> | 
|  | <span class="sd">            whether to interrupt jobs on job cancellation.</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            a list containing all the elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.collect`</span> | 
|  | <span class="sd">        :meth:`SparkContext.setJobGroup`</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> | 
|  | <span class="s2">"Deprecated in 3.1, Use pyspark.InheritableThread with "</span> | 
|  | <span class="s2">"the pinned thread mode enabled."</span><span class="p">,</span> | 
|  | <span class="ne">FutureWarning</span><span class="p">,</span> | 
|  | <span class="p">)</span> | 
|  |  | 
|  | <span class="k">with</span> <span class="n">SCCallSiteSync</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">context</span><span class="p">):</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="n">sock_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">collectAndServeWithJobGroup</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">(),</span> <span class="n">groupId</span><span class="p">,</span> <span class="n">description</span><span class="p">,</span> <span class="n">interruptOnCancel</span> | 
|  | <span class="p">)</span> | 
|  | <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="n">_load_from_socket</span><span class="p">(</span><span class="n">sock_info</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">))</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.reduce"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.reduce.html#pyspark.RDD.reduce">[docs]</a>    <span class="k">def</span> <span class="nf">reduce</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">,</span> <span class="n">T</span><span class="p">],</span> <span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Reduces the elements of this RDD using the specified commutative and</span> | 
|  | <span class="sd">        associative binary operator. Currently reduces partitions locally.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            the reduce function</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        T</span> | 
|  | <span class="sd">            the aggregated result</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.treeReduce`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregate`</span> | 
|  | <span class="sd">        :meth:`RDD.treeAggregate`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> from operator import add</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add)</span> | 
|  | <span class="sd">        15</span> | 
|  | <span class="sd">        >>> sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add)</span> | 
|  | <span class="sd">        10</span> | 
|  | <span class="sd">        >>> sc.parallelize([]).reduce(add)</span> | 
|  | <span class="sd">        Traceback (most recent call last):</span> | 
|  | <span class="sd">            ...</span> | 
|  | <span class="sd">        ValueError: Can not reduce() empty RDD</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">f</span> <span class="o">=</span> <span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="n">iterator</span> <span class="o">=</span> <span class="nb">iter</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  | <span class="k">try</span><span class="p">:</span> | 
|  | <span class="n">initial</span> <span class="o">=</span> <span class="nb">next</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  | <span class="k">except</span> <span class="ne">StopIteration</span><span class="p">:</span> | 
|  | <span class="k">return</span> | 
|  | <span class="k">yield</span> <span class="n">reduce</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">iterator</span><span class="p">,</span> <span class="n">initial</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">vals</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  | <span class="k">if</span> <span class="n">vals</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">reduce</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">vals</span><span class="p">)</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Can not reduce() empty RDD"</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.treeReduce"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.treeReduce.html#pyspark.RDD.treeReduce">[docs]</a>    <span class="k">def</span> <span class="nf">treeReduce</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">,</span> <span class="n">T</span><span class="p">],</span> <span class="n">T</span><span class="p">],</span> <span class="n">depth</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">)</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Reduces the elements of this RDD in a multi-level tree pattern.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.3.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            the reduce function</span> | 
|  | <span class="sd">        depth : int, optional, default 2</span> | 
|  | <span class="sd">            suggested depth of the tree (default: 2)</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        T</span> | 
|  | <span class="sd">            the aggregated result</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduce`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregate`</span> | 
|  | <span class="sd">        :meth:`RDD.treeAggregate`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> add = lambda x, y: x + y</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)</span> | 
|  | <span class="sd">        >>> rdd.treeReduce(add)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeReduce(add, 1)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeReduce(add, 2)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeReduce(add, 5)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeReduce(add, 10)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">depth</span> <span class="o"><</span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Depth cannot be smaller than 1 but got </span><span class="si">%d</span><span class="s2">."</span> <span class="o">%</span> <span class="n">depth</span><span class="p">)</span> | 
|  |  | 
|  | <span class="c1"># Use the second entry to indicate whether this is a dummy value.</span> | 
|  | <span class="n">zeroValue</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span>  <span class="c1"># type: ignore[assignment]</span> | 
|  | <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="kc">True</span><span class="p">,</span> | 
|  | <span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">op</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="nb">bool</span><span class="p">],</span> <span class="n">y</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="nb">bool</span><span class="p">])</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="nb">bool</span><span class="p">]:</span> | 
|  | <span class="k">if</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">y</span> | 
|  | <span class="k">elif</span> <span class="n">y</span><span class="p">[</span><span class="mi">1</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">x</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">y</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="kc">False</span> | 
|  |  | 
|  | <span class="n">reduced</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="kc">False</span><span class="p">))</span><span class="o">.</span><span class="n">treeAggregate</span><span class="p">(</span><span class="n">zeroValue</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">depth</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="n">reduced</span><span class="p">[</span><span class="mi">1</span><span class="p">]:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Cannot reduce empty RDD."</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">reduced</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.fold"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.fold.html#pyspark.RDD.fold">[docs]</a>    <span class="k">def</span> <span class="nf">fold</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">zeroValue</span><span class="p">:</span> <span class="n">T</span><span class="p">,</span> <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">,</span> <span class="n">T</span><span class="p">],</span> <span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Aggregate the elements of each partition, and then the results for all</span> | 
|  | <span class="sd">        the partitions, using a given associative function and a neutral "zero value."</span> | 
|  |  | 
|  | <span class="sd">        The function ``op(t1, t2)`` is allowed to modify ``t1`` and return it</span> | 
|  | <span class="sd">        as its result value to avoid object allocation; however, it should not</span> | 
|  | <span class="sd">        modify ``t2``.</span> | 
|  |  | 
|  | <span class="sd">        This behaves somewhat differently from fold operations implemented</span> | 
|  | <span class="sd">        for non-distributed collections in functional languages like Scala.</span> | 
|  | <span class="sd">        This fold operation may be applied to partitions individually, and then</span> | 
|  | <span class="sd">        fold those results into the final result, rather than apply the fold</span> | 
|  | <span class="sd">        to each element sequentially in some defined ordering. For functions</span> | 
|  | <span class="sd">        that are not commutative, the result may differ from that of a fold</span> | 
|  | <span class="sd">        applied to a non-distributed collection.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        zeroValue : T</span> | 
|  | <span class="sd">            the initial value for the accumulated result of each partition</span> | 
|  | <span class="sd">        op : function</span> | 
|  | <span class="sd">            a function used to both accumulate results within a partition and combine</span> | 
|  | <span class="sd">            results from different partitions</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        T</span> | 
|  | <span class="sd">            the aggregated result</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduce`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregate`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> from operator import add</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)</span> | 
|  | <span class="sd">        15</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">op</span> <span class="o">=</span> <span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="n">acc</span> <span class="o">=</span> <span class="n">zeroValue</span> | 
|  | <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">acc</span> <span class="o">=</span> <span class="n">op</span><span class="p">(</span><span class="n">acc</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> | 
|  | <span class="k">yield</span> <span class="n">acc</span> | 
|  |  | 
|  | <span class="c1"># collecting result of mapPartitions here ensures that the copy of</span> | 
|  | <span class="c1"># zeroValue provided to each partition is unique from the one provided</span> | 
|  | <span class="c1"># to the final reduce call</span> | 
|  | <span class="n">vals</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  | <span class="k">return</span> <span class="n">reduce</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">vals</span><span class="p">,</span> <span class="n">zeroValue</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.aggregate"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.aggregate.html#pyspark.RDD.aggregate">[docs]</a>    <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">zeroValue</span><span class="p">:</span> <span class="n">U</span><span class="p">,</span> <span class="n">seqOp</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">T</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> <span class="n">combOp</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">U</span><span class="p">],</span> <span class="n">U</span><span class="p">]</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">U</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Aggregate the elements of each partition, and then the results for all</span> | 
|  | <span class="sd">        the partitions, using a given combine functions and a neutral "zero</span> | 
|  | <span class="sd">        value."</span> | 
|  |  | 
|  | <span class="sd">        The functions ``op(t1, t2)`` is allowed to modify ``t1`` and return it</span> | 
|  | <span class="sd">        as its result value to avoid object allocation; however, it should not</span> | 
|  | <span class="sd">        modify ``t2``.</span> | 
|  |  | 
|  | <span class="sd">        The first function (seqOp) can return a different result type, U, than</span> | 
|  | <span class="sd">        the type of this RDD. Thus, we need one operation for merging a T into</span> | 
|  | <span class="sd">        an U and one operation for merging two U</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        zeroValue : U</span> | 
|  | <span class="sd">            the initial value for the accumulated result of each partition</span> | 
|  | <span class="sd">        seqOp : function</span> | 
|  | <span class="sd">            a function used to accumulate results within a partition</span> | 
|  | <span class="sd">        combOp : function</span> | 
|  | <span class="sd">            an associative function used to combine results from different partitions</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        U</span> | 
|  | <span class="sd">            the aggregated result</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduce`</span> | 
|  | <span class="sd">        :meth:`RDD.fold`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> seqOp = (lambda x, y: (x[0] + y, x[1] + 1))</span> | 
|  | <span class="sd">        >>> combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)</span> | 
|  | <span class="sd">        (10, 4)</span> | 
|  | <span class="sd">        >>> sc.parallelize([]).aggregate((0, 0), seqOp, combOp)</span> | 
|  | <span class="sd">        (0, 0)</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">seqOp</span> <span class="o">=</span> <span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">seqOp</span><span class="p">)</span> | 
|  | <span class="n">combOp</span> <span class="o">=</span> <span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">combOp</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="n">acc</span> <span class="o">=</span> <span class="n">zeroValue</span> | 
|  | <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">acc</span> <span class="o">=</span> <span class="n">seqOp</span><span class="p">(</span><span class="n">acc</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> | 
|  | <span class="k">yield</span> <span class="n">acc</span> | 
|  |  | 
|  | <span class="c1"># collecting result of mapPartitions here ensures that the copy of</span> | 
|  | <span class="c1"># zeroValue provided to each partition is unique from the one provided</span> | 
|  | <span class="c1"># to the final reduce call</span> | 
|  | <span class="n">vals</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  | <span class="k">return</span> <span class="n">reduce</span><span class="p">(</span><span class="n">combOp</span><span class="p">,</span> <span class="n">vals</span><span class="p">,</span> <span class="n">zeroValue</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.treeAggregate"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.treeAggregate.html#pyspark.RDD.treeAggregate">[docs]</a>    <span class="k">def</span> <span class="nf">treeAggregate</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> | 
|  | <span class="n">zeroValue</span><span class="p">:</span> <span class="n">U</span><span class="p">,</span> | 
|  | <span class="n">seqOp</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">T</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> | 
|  | <span class="n">combOp</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">U</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> | 
|  | <span class="n">depth</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">U</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Aggregates the elements of this RDD in a multi-level tree</span> | 
|  | <span class="sd">        pattern.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.3.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        zeroValue : U</span> | 
|  | <span class="sd">            the initial value for the accumulated result of each partition</span> | 
|  | <span class="sd">        seqOp : function</span> | 
|  | <span class="sd">            a function used to accumulate results within a partition</span> | 
|  | <span class="sd">        combOp : function</span> | 
|  | <span class="sd">            an associative function used to combine results from different partitions</span> | 
|  | <span class="sd">        depth : int, optional, default 2</span> | 
|  | <span class="sd">            suggested depth of the tree</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        U</span> | 
|  | <span class="sd">            the aggregated result</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.aggregate`</span> | 
|  | <span class="sd">        :meth:`RDD.treeReduce`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> add = lambda x, y: x + y</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)</span> | 
|  | <span class="sd">        >>> rdd.treeAggregate(0, add, add)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeAggregate(0, add, add, 1)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeAggregate(0, add, add, 2)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeAggregate(0, add, add, 5)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        >>> rdd.treeAggregate(0, add, add, 10)</span> | 
|  | <span class="sd">        -5</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">depth</span> <span class="o"><</span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Depth cannot be smaller than 1 but got </span><span class="si">%d</span><span class="s2">."</span> <span class="o">%</span> <span class="n">depth</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">zeroValue</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">aggregatePartition</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="n">acc</span> <span class="o">=</span> <span class="n">zeroValue</span> | 
|  | <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">acc</span> <span class="o">=</span> <span class="n">seqOp</span><span class="p">(</span><span class="n">acc</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> | 
|  | <span class="k">yield</span> <span class="n">acc</span> | 
|  |  | 
|  | <span class="n">partiallyAggregated</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">aggregatePartition</span><span class="p">)</span> | 
|  | <span class="n">numPartitions</span> <span class="o">=</span> <span class="n">partiallyAggregated</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> | 
|  | <span class="n">scale</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">ceil</span><span class="p">(</span><span class="nb">pow</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="mf">1.0</span> <span class="o">/</span> <span class="n">depth</span><span class="p">))),</span> <span class="mi">2</span><span class="p">)</span> | 
|  | <span class="c1"># If creating an extra level doesn't help reduce the wall-clock time, we stop the tree</span> | 
|  | <span class="c1"># aggregation.</span> | 
|  | <span class="k">while</span> <span class="n">numPartitions</span> <span class="o">></span> <span class="n">scale</span> <span class="o">+</span> <span class="n">numPartitions</span> <span class="o">/</span> <span class="n">scale</span><span class="p">:</span> | 
|  | <span class="n">numPartitions</span> <span class="o">/=</span> <span class="n">scale</span>  <span class="c1"># type: ignore[assignment]</span> | 
|  | <span class="n">curNumPartitions</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">)</span> | 
|  |  | 
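|  | <span class="c1"># Re-key each partial aggregate by i % curNumPartitions so that the</span> | 
|  | <span class="c1"># reduceByKey below combines this round's partials into curNumPartitions groups.</span> | 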
|  | <span class="k">def</span> <span class="nf">mapPartition</span><span class="p">(</span><span class="n">i</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">U</span><span class="p">]]:</span> | 
|  | <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="k">yield</span> <span class="p">(</span><span class="n">i</span> <span class="o">%</span> <span class="n">curNumPartitions</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">partiallyAggregated</span> <span class="o">=</span> <span class="p">(</span> | 
|  | <span class="n">partiallyAggregated</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">mapPartition</span><span class="p">)</span> | 
|  | <span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="n">combOp</span><span class="p">,</span> <span class="n">curNumPartitions</span><span class="p">)</span> | 
|  | <span class="o">.</span><span class="n">values</span><span class="p">()</span> | 
|  | <span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">partiallyAggregated</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">combOp</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[S]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"S"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">])</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.max"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.max.html#pyspark.RDD.max">[docs]</a>    <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Find the maximum item in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        key : function, optional</span> | 
|  | <span class="sd">            A function used to generate key for comparing</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        T</span> | 
|  | <span class="sd">            the maximum item</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.min`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])</span> | 
|  | <span class="sd">        >>> rdd.max()</span> | 
|  | <span class="sd">        43.0</span> | 
|  | <span class="sd">        >>> rdd.max(key=str)</span> | 
|  | <span class="sd">        5.0</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">key</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="nb">max</span><span class="p">)</span>  <span class="c1"># type: ignore[arg-type]</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">:</span> <span class="nb">max</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">key</span><span class="p">))</span>  <span class="c1"># type: ignore[arg-type]</span></div> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[S]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"S"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">])</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.min"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.min.html#pyspark.RDD.min">[docs]</a>    <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Find the minimum item in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        key : function, optional</span> | 
|  | <span class="sd">            A function used to generate key for comparing</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        T</span> | 
|  | <span class="sd">            the minimum item</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.max`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0])</span> | 
|  | <span class="sd">        >>> rdd.min()</span> | 
|  | <span class="sd">        2.0</span> | 
|  | <span class="sd">        >>> rdd.min(key=str)</span> | 
|  | <span class="sd">        10.0</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">key</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="nb">min</span><span class="p">)</span>  <span class="c1"># type: ignore[arg-type]</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">:</span> <span class="nb">min</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">key</span><span class="p">))</span>  <span class="c1"># type: ignore[arg-type]</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sum"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sum.html#pyspark.RDD.sum">[docs]</a>    <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[NumberOrArray]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"NumberOrArray"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Add up the elements in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        float, int, or complex</span> | 
|  | <span class="sd">            the sum of all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.mean`</span> | 
|  | <span class="sd">        :meth:`RDD.sumApprox`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([1.0, 2.0, 3.0]).sum()</span> | 
|  | <span class="sd">        6.0</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">[</span><span class="nb">sum</span><span class="p">(</span><span class="n">x</span><span class="p">)])</span><span class="o">.</span><span class="n">fold</span><span class="p">(</span>  <span class="c1"># type: ignore[return-value]</span> | 
|  | <span class="mi">0</span><span class="p">,</span> <span class="n">operator</span><span class="o">.</span><span class="n">add</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.count"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.count.html#pyspark.RDD.count">[docs]</a>    <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the number of elements in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        int</span> | 
|  | <span class="sd">            the number of elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.countApprox`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.count`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([2, 3, 4]).count()</span> | 
|  | <span class="sd">        3</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="p">[</span><span class="nb">sum</span><span class="p">(</span><span class="mi">1</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">i</span><span class="p">)])</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.stats"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.stats.html#pyspark.RDD.stats">[docs]</a>    <span class="k">def</span> <span class="nf">stats</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[NumberOrArray]"</span><span class="p">)</span> <span class="o">-></span> <span class="n">StatCounter</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a :class:`StatCounter` object that captures the mean, variance</span> | 
|  | <span class="sd">        and count of the RDD's elements in one operation.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`StatCounter`</span> | 
|  | <span class="sd">            a :class:`StatCounter` capturing the mean, variance and count of all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.stdev`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleStdev`</span> | 
|  | <span class="sd">        :meth:`RDD.variance`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleVariance`</span> | 
|  | <span class="sd">        :meth:`RDD.histogram`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.stat`</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">redFunc</span><span class="p">(</span><span class="n">left_counter</span><span class="p">:</span> <span class="n">StatCounter</span><span class="p">,</span> <span class="n">right_counter</span><span class="p">:</span> <span class="n">StatCounter</span><span class="p">)</span> <span class="o">-></span> <span class="n">StatCounter</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">left_counter</span><span class="o">.</span><span class="n">mergeStats</span><span class="p">(</span><span class="n">right_counter</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="p">[</span><span class="n">StatCounter</span><span class="p">(</span><span class="n">i</span><span class="p">)])</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span>  <span class="c1"># type: ignore[arg-type]</span> | 
|  | <span class="n">redFunc</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.histogram"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.histogram.html#pyspark.RDD.histogram">[docs]</a>    <span class="k">def</span> <span class="nf">histogram</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[S]"</span><span class="p">,</span> <span class="n">buckets</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"S"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"S"</span><span class="p">,</span> <span class="o">...</span><span class="p">]]</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="s2">"S"</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Compute a histogram using the provided buckets. The buckets</span> | 
|  | <span class="sd">        are all open to the right except for the last which is closed.</span> | 
|  | <span class="sd">        e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],</span> | 
|  | <span class="sd">        which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1</span> | 
|  | <span class="sd">        and 50 we would have a histogram of 1,0,1.</span> | 
|  |  | 
|  | <span class="sd">        If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),</span> | 
|  | <span class="sd">        this can be switched from an O(log n) insertion to O(1) per</span> | 
|  | <span class="sd">        element (where n is the number of buckets).</span> | 
|  |  | 
|  | <span class="sd">        Buckets must be sorted, not contain any duplicates, and have</span> | 
|  | <span class="sd">        at least two elements.</span> | 
|  |  | 
|  | <span class="sd">        If `buckets` is a number, it will generate buckets which are</span> | 
|  | <span class="sd">        evenly spaced between the minimum and maximum of the RDD. For</span> | 
|  | <span class="sd">        example, if the min value is 0 and the max is 100, given `buckets`</span> | 
|  | <span class="sd">        as 2, the resulting buckets will be [0,50) [50,100]. `buckets` must</span> | 
|  | <span class="sd">        be at least 1. An exception is raised if the RDD contains infinity.</span> | 
|  | <span class="sd">        If the elements in the RDD do not vary (max == min), a single bucket</span> | 
|  | <span class="sd">        will be used.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        buckets : int, or list, or tuple</span> | 
|  | <span class="sd">            if `buckets` is a number, it computes a histogram of the data using</span> | 
|  | <span class="sd">            `buckets` number of buckets evenly, otherwise, `buckets` is the provided</span> | 
|  | <span class="sd">            buckets to bin the data.</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        tuple</span> | 
|  | <span class="sd">            a tuple of buckets and histogram</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.stats`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(51))</span> | 
|  | <span class="sd">        >>> rdd.histogram(2)</span> | 
|  | <span class="sd">        ([0, 25, 50], [25, 26])</span> | 
|  | <span class="sd">        >>> rdd.histogram([0, 5, 25, 50])</span> | 
|  | <span class="sd">        ([0, 5, 25, 50], [5, 20, 26])</span> | 
|  | <span class="sd">        >>> rdd.histogram([0, 15, 30, 45, 60])  # evenly spaced buckets</span> | 
|  | <span class="sd">        ([0, 15, 30, 45, 60], [15, 15, 15, 6])</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(["ab", "ac", "b", "bd", "ef"])</span> | 
|  | <span class="sd">        >>> rdd.histogram(("a", "b", "c"))</span> | 
|  | <span class="sd">        (('a', 'b', 'c'), [2, 2])</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">buckets</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> | 
|  | <span class="k">if</span> <span class="n">buckets</span> <span class="o"><</span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"number of buckets must be >= 1"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="c1"># filter out non-comparable elements</span> | 
|  | <span class="k">def</span> <span class="nf">comparable</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="n">x</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="kc">False</span> | 
|  | <span class="k">if</span> <span class="nb">type</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="ow">is</span> <span class="nb">float</span> <span class="ow">and</span> <span class="n">isnan</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> | 
|  | <span class="k">return</span> <span class="kc">False</span> | 
|  | <span class="k">return</span> <span class="kc">True</span> | 
|  |  | 
|  | <span class="n">filtered</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">comparable</span><span class="p">)</span> | 
|  |  | 
|  | <span class="c1"># faster than stats()</span> | 
|  | <span class="k">def</span> <span class="nf">minmax</span><span class="p">(</span><span class="n">a</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"S"</span><span class="p">,</span> <span class="s2">"S"</span><span class="p">],</span> <span class="n">b</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"S"</span><span class="p">,</span> <span class="s2">"S"</span><span class="p">])</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"S"</span><span class="p">,</span> <span class="s2">"S"</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="nb">min</span><span class="p">(</span><span class="n">a</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">b</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="nb">max</span><span class="p">(</span><span class="n">a</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">b</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> | 
|  |  | 
|  | <span class="k">try</span><span class="p">:</span> | 
|  | <span class="n">minv</span><span class="p">,</span> <span class="n">maxv</span> <span class="o">=</span> <span class="n">filtered</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">x</span><span class="p">))</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">minmax</span><span class="p">)</span> | 
|  | <span class="k">except</span> <span class="ne">TypeError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="s2">" empty "</span> <span class="ow">in</span> <span class="nb">str</span><span class="p">(</span><span class="n">e</span><span class="p">):</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"can not generate buckets from empty RDD"</span><span class="p">)</span> | 
|  | <span class="k">raise</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="n">minv</span> <span class="o">==</span> <span class="n">maxv</span> <span class="ow">or</span> <span class="n">buckets</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="p">[</span><span class="n">minv</span><span class="p">,</span> <span class="n">maxv</span><span class="p">],</span> <span class="p">[</span><span class="n">filtered</span><span class="o">.</span><span class="n">count</span><span class="p">()]</span> | 
|  |  | 
|  | <span class="k">try</span><span class="p">:</span> | 
|  | <span class="n">inc</span> <span class="o">=</span> <span class="p">(</span><span class="n">maxv</span> <span class="o">-</span> <span class="n">minv</span><span class="p">)</span> <span class="o">/</span> <span class="n">buckets</span>  <span class="c1"># type: ignore[operator]</span> | 
|  | <span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Can not generate buckets with non-number in RDD"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="n">isinf</span><span class="p">(</span><span class="n">inc</span><span class="p">):</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Can not generate buckets with infinite value"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="c1"># keep them as integer if possible</span> | 
|  | <span class="n">inc</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">inc</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="n">inc</span> <span class="o">*</span> <span class="n">buckets</span> <span class="o">!=</span> <span class="n">maxv</span> <span class="o">-</span> <span class="n">minv</span><span class="p">:</span>  <span class="c1"># type: ignore[operator]</span> | 
|  | <span class="n">inc</span> <span class="o">=</span> <span class="p">(</span><span class="n">maxv</span> <span class="o">-</span> <span class="n">minv</span><span class="p">)</span> <span class="o">*</span> <span class="mf">1.0</span> <span class="o">/</span> <span class="n">buckets</span>  <span class="c1"># type: ignore[operator]</span> | 
|  |  | 
|  | <span class="n">buckets</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="o">*</span> <span class="n">inc</span> <span class="o">+</span> <span class="n">minv</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">buckets</span><span class="p">)]</span> | 
|  | <span class="n">buckets</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">maxv</span><span class="p">)</span>  <span class="c1"># fix accumulated error</span> | 
|  | <span class="n">even</span> <span class="o">=</span> <span class="kc">True</span> | 
|  |  | 
|  | <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">buckets</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> | 
|  | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">buckets</span><span class="p">)</span> <span class="o"><</span> <span class="mi">2</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"buckets should have more than one value"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">i</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="nb">float</span><span class="p">)</span> <span class="ow">and</span> <span class="n">isnan</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">buckets</span><span class="p">):</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"can not have None or NaN in buckets"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">buckets</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">list</span><span class="p">(</span><span class="n">buckets</span><span class="p">):</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"buckets should be sorted"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">buckets</span><span class="p">))</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">buckets</span><span class="p">):</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"buckets should not contain duplicated values"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">minv</span> <span class="o">=</span> <span class="n">buckets</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> | 
|  | <span class="n">maxv</span> <span class="o">=</span> <span class="n">buckets</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> | 
|  | <span class="n">even</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="n">inc</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="k">try</span><span class="p">:</span> | 
|  | <span class="n">steps</span> <span class="o">=</span> <span class="p">[</span> | 
|  | <span class="n">buckets</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span> <span class="o">-</span> <span class="n">buckets</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>  <span class="c1"># type: ignore[operator]</span> | 
|  | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">buckets</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> | 
|  | <span class="p">]</span> | 
|  | <span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span> | 
|  | <span class="k">pass</span>  <span class="c1"># objects in buckets do not support '-'</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="nb">max</span><span class="p">(</span><span class="n">steps</span><span class="p">)</span> <span class="o">-</span> <span class="nb">min</span><span class="p">(</span><span class="n">steps</span><span class="p">)</span> <span class="o"><</span> <span class="mf">1e-10</span><span class="p">:</span>  <span class="c1"># handle precision errors</span> | 
|  | <span class="n">even</span> <span class="o">=</span> <span class="kc">True</span> | 
|  | <span class="n">inc</span> <span class="o">=</span> <span class="p">(</span><span class="n">maxv</span> <span class="o">-</span> <span class="n">minv</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">buckets</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span>  <span class="c1"># type: ignore[operator]</span> | 
|  |  | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"buckets should be a list or tuple or number(int or long)"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">histogram</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="s2">"S"</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]:</span> | 
|  | <span class="n">counters</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">buckets</span><span class="p">)</span>  <span class="c1"># type: ignore[arg-type]</span> | 
|  | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="n">i</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="nb">float</span><span class="p">)</span> <span class="ow">and</span> <span class="n">isnan</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> <span class="ow">or</span> <span class="n">i</span> <span class="o">></span> <span class="n">maxv</span> <span class="ow">or</span> <span class="n">i</span> <span class="o"><</span> <span class="n">minv</span><span class="p">:</span> | 
|  | <span class="k">continue</span> | 
|  | <span class="n">t</span> <span class="o">=</span> <span class="p">(</span> | 
|  | <span class="nb">int</span><span class="p">((</span><span class="n">i</span> <span class="o">-</span> <span class="n">minv</span><span class="p">)</span> <span class="o">/</span> <span class="n">inc</span><span class="p">)</span>  <span class="c1"># type: ignore[operator]</span> | 
|  | <span class="k">if</span> <span class="n">even</span> | 
|  | <span class="k">else</span> <span class="n">bisect</span><span class="o">.</span><span class="n">bisect_right</span><span class="p">(</span><span class="n">buckets</span><span class="p">,</span> <span class="n">i</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span>  <span class="c1"># type: ignore[arg-type]</span> | 
|  | <span class="p">)</span> | 
|  | <span class="n">counters</span><span class="p">[</span><span class="n">t</span><span class="p">]</span> <span class="o">+=</span> <span class="mi">1</span> | 
|  | <span class="c1"># add last two together</span> | 
|  | <span class="n">last</span> <span class="o">=</span> <span class="n">counters</span><span class="o">.</span><span class="n">pop</span><span class="p">()</span> | 
|  | <span class="n">counters</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">+=</span> <span class="n">last</span> | 
|  | <span class="k">return</span> <span class="p">[</span><span class="n">counters</span><span class="p">]</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">mergeCounters</span><span class="p">(</span><span class="n">a</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">b</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="n">j</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">)]</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">buckets</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">histogram</span><span class="p">)</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">mergeCounters</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.mean"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.mean.html#pyspark.RDD.mean">[docs]</a>    <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[NumberOrArray]"</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Compute the mean of this RDD's elements.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        float</span> | 
|  | <span class="sd">            the mean of all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.stats`</span> | 
|  | <span class="sd">        :meth:`RDD.sum`</span> | 
|  | <span class="sd">        :meth:`RDD.meanApprox`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3]).mean()</span> | 
|  | <span class="sd">        2.0</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.variance"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.variance.html#pyspark.RDD.variance">[docs]</a>    <span class="k">def</span> <span class="nf">variance</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[NumberOrArray]"</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Compute the variance of this RDD's elements.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        float</span> | 
|  | <span class="sd">            the variance of all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.stats`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleVariance`</span> | 
|  | <span class="sd">        :meth:`RDD.stdev`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleStdev`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3]).variance()</span> | 
|  | <span class="sd">        0.666...</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span><span class="o">.</span><span class="n">variance</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.stdev"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.stdev.html#pyspark.RDD.stdev">[docs]</a>    <span class="k">def</span> <span class="nf">stdev</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[NumberOrArray]"</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Compute the standard deviation of this RDD's elements.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        float</span> | 
|  | <span class="sd">            the standard deviation of all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.stats`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleStdev`</span> | 
|  | <span class="sd">        :meth:`RDD.variance`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleVariance`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3]).stdev()</span> | 
|  | <span class="sd">        0.816...</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span><span class="o">.</span><span class="n">stdev</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sampleStdev"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sampleStdev.html#pyspark.RDD.sampleStdev">[docs]</a>    <span class="k">def</span> <span class="nf">sampleStdev</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[NumberOrArray]"</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Compute the sample standard deviation of this RDD's elements (which</span> | 
|  | <span class="sd">        corrects for bias in estimating the standard deviation by dividing by</span> | 
|  | <span class="sd">        N-1 instead of N).</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        float</span> | 
|  | <span class="sd">            the sample standard deviation of all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.stats`</span> | 
|  | <span class="sd">        :meth:`RDD.stdev`</span> | 
|  | <span class="sd">        :meth:`RDD.variance`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleVariance`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3]).sampleStdev()</span> | 
|  | <span class="sd">        1.0</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span><span class="o">.</span><span class="n">sampleStdev</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sampleVariance"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sampleVariance.html#pyspark.RDD.sampleVariance">[docs]</a>    <span class="k">def</span> <span class="nf">sampleVariance</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[NumberOrArray]"</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Compute the sample variance of this RDD's elements (which corrects</span> | 
|  | <span class="sd">        for bias in estimating the variance by dividing by N-1 instead of N).</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        float</span> | 
|  | <span class="sd">            the sample variance of all elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.stats`</span> | 
|  | <span class="sd">        :meth:`RDD.variance`</span> | 
|  | <span class="sd">        :meth:`RDD.stdev`</span> | 
|  | <span class="sd">        :meth:`RDD.sampleStdev`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3]).sampleVariance()</span> | 
|  | <span class="sd">        1.0</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span><span class="o">.</span><span class="n">sampleVariance</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.countByValue"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.countByValue.html#pyspark.RDD.countByValue">[docs]</a>    <span class="k">def</span> <span class="nf">countByValue</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[K]"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the count of each unique value in this RDD as a dictionary of</span> | 
|  | <span class="sd">        (value, count) pairs.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        dict</span> | 
|  | <span class="sd">            a dictionary of (value, count) pairs</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.collectAsMap`</span> | 
|  | <span class="sd">        :meth:`RDD.countByKey`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())</span> | 
|  | <span class="sd">        [(1, 2), (2, 3)]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
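|  | <span class="c1"># Count values into one dict per partition, then merge the dicts pairwise</span> | 
|  | <span class="c1"># on the driver via reduce.</span> | 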
|  | <span class="k">def</span> <span class="nf">countPartition</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">K</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]:</span> | 
|  | <span class="n">counts</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span> | 
|  | <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">counts</span><span class="p">[</span><span class="n">obj</span><span class="p">]</span> <span class="o">+=</span> <span class="mi">1</span> | 
|  | <span class="k">yield</span> <span class="n">counts</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">mergeMaps</span><span class="p">(</span><span class="n">m1</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">m2</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span> | 
|  | <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">m2</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> | 
|  | <span class="n">m1</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">+=</span> <span class="n">v</span> | 
|  | <span class="k">return</span> <span class="n">m1</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">countPartition</span><span class="p">)</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">mergeMaps</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">top</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[S]"</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="s2">"S"</span><span class="p">]:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">top</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.top"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.top.html#pyspark.RDD.top">[docs]</a>    <span class="k">def</span> <span class="nf">top</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Get the top N elements from an RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        num : int</span> | 
|  | <span class="sd">            top N</span> | 
|  | <span class="sd">        key : function, optional</span> | 
|  | <span class="sd">            a function used to generate key for comparing</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            the top N elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.takeOrdered`</span> | 
|  | <span class="sd">        :meth:`RDD.max`</span> | 
|  | <span class="sd">        :meth:`RDD.min`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This method should only be used if the resulting array is expected</span> | 
|  | <span class="sd">        to be small, as all the data is loaded into the driver's memory.</span> | 
|  |  | 
|  | <span class="sd">        It returns the list sorted in descending order.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)</span> | 
|  | <span class="sd">        [12]</span> | 
|  | <span class="sd">        >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)</span> | 
|  | <span class="sd">        [6, 5]</span> | 
|  | <span class="sd">        >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)</span> | 
|  | <span class="sd">        [4, 3, 2]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
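|  | <span class="c1"># Each partition contributes at most its num largest elements; merging two</span> | 
|  | <span class="c1"># candidate lists with nlargest keeps every intermediate result bounded by num.</span> | 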
|  | <span class="k">def</span> <span class="nf">topIterator</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]]:</span> | 
|  | <span class="k">yield</span> <span class="n">heapq</span><span class="o">.</span><span class="n">nlargest</span><span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">iterator</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">key</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">merge</span><span class="p">(</span><span class="n">a</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">],</span> <span class="n">b</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">heapq</span><span class="o">.</span><span class="n">nlargest</span><span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">key</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">topIterator</span><span class="p">)</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">merge</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">takeOrdered</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[S]"</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="s2">"S"</span><span class="p">]:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">takeOrdered</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.takeOrdered"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.takeOrdered.html#pyspark.RDD.takeOrdered">[docs]</a>    <span class="k">def</span> <span class="nf">takeOrdered</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="s2">"S"</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Get the N elements from an RDD ordered in ascending order or as</span> | 
|  | <span class="sd">        specified by the optional key function.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        num : int</span> | 
|  | <span class="sd">            top N</span> | 
|  | <span class="sd">        key : function, optional</span> | 
|  | <span class="sd">            a function used to generate key for comparing</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            the top N elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.top`</span> | 
|  | <span class="sd">        :meth:`RDD.max`</span> | 
|  | <span class="sd">        :meth:`RDD.min`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This method should only be used if the resulting array is expected</span> | 
|  | <span class="sd">        to be small, as all the data is loaded into the driver's memory.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)</span> | 
|  | <span class="sd">        [1, 2, 3, 4, 5, 6]</span> | 
|  | <span class="sd">        >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)</span> | 
|  | <span class="sd">        [10, 9, 7, 6, 5, 4]</span> | 
|  | <span class="sd">        >>> sc.emptyRDD().takeOrdered(3)</span> | 
|  | <span class="sd">        []</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">num</span> <span class="o"><</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"top N cannot be negative."</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="n">num</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="p">[]</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">merge</span><span class="p">(</span><span class="n">a</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">],</span> <span class="n">b</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">heapq</span><span class="o">.</span><span class="n">nsmallest</span><span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="k">lambda</span> <span class="n">it</span><span class="p">:</span> <span class="p">[</span><span class="n">heapq</span><span class="o">.</span><span class="n">nsmallest</span><span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">it</span><span class="p">,</span> <span class="n">key</span><span class="p">)])</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">merge</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.take"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.take.html#pyspark.RDD.take">[docs]</a>    <span class="k">def</span> <span class="nf">take</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Take the first num elements of the RDD.</span> | 
|  |  | 
|  | <span class="sd">        It works by first scanning one partition, and use the results from</span> | 
|  | <span class="sd">        that partition to estimate the number of additional partitions needed</span> | 
|  | <span class="sd">        to satisfy the limit.</span> | 
|  |  | 
|  | <span class="sd">        Translated from the Scala implementation in RDD#take().</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        num : int</span> | 
|  | <span class="sd">            first number of elements</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            the first `num` elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.first`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.take`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This method should only be used if the resulting array is expected</span> | 
|  | <span class="sd">        to be small, as all the data is loaded into the driver's memory.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)</span> | 
|  | <span class="sd">        [2, 3]</span> | 
|  | <span class="sd">        >>> sc.parallelize([2, 3, 4, 5, 6]).take(10)</span> | 
|  | <span class="sd">        [2, 3, 4, 5, 6]</span> | 
|  | <span class="sd">        >>> sc.parallelize(range(100), 100).filter(lambda x: x > 90).take(3)</span> | 
|  | <span class="sd">        [91, 92, 93]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">items</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">T</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span> | 
|  | <span class="n">totalParts</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> | 
|  | <span class="n">partsScanned</span> <span class="o">=</span> <span class="mi">0</span> | 
|  |  | 
|  | <span class="k">while</span> <span class="nb">len</span><span class="p">(</span><span class="n">items</span><span class="p">)</span> <span class="o"><</span> <span class="n">num</span> <span class="ow">and</span> <span class="n">partsScanned</span> <span class="o"><</span> <span class="n">totalParts</span><span class="p">:</span> | 
|  | <span class="c1"># The number of partitions to try in this iteration.</span> | 
|  | <span class="c1"># It is ok for this number to be greater than totalParts because</span> | 
|  | <span class="c1"># we actually cap it at totalParts in runJob.</span> | 
|  | <span class="n">numPartsToTry</span> <span class="o">=</span> <span class="mi">1</span> | 
|  | <span class="k">if</span> <span class="n">partsScanned</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="c1"># If we didn't find any rows after the previous iteration,</span> | 
|  | <span class="c1"># quadruple and retry.  Otherwise, interpolate the number of</span> | 
|  | <span class="c1"># partitions we need to try, but overestimate it by 50%.</span> | 
|  | <span class="c1"># We also cap the estimation in the end.</span> | 
|  | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">items</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="n">numPartsToTry</span> <span class="o">=</span> <span class="n">partsScanned</span> <span class="o">*</span> <span class="mi">4</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="c1"># the first parameter of max is >=1 whenever partsScanned >= 2</span> | 
|  | <span class="n">numPartsToTry</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="mf">1.5</span> <span class="o">*</span> <span class="n">num</span> <span class="o">*</span> <span class="n">partsScanned</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">items</span><span class="p">))</span> <span class="o">-</span> <span class="n">partsScanned</span> | 
|  | <span class="n">numPartsToTry</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="nb">max</span><span class="p">(</span><span class="n">numPartsToTry</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">partsScanned</span> <span class="o">*</span> <span class="mi">4</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">left</span> <span class="o">=</span> <span class="n">num</span> <span class="o">-</span> <span class="nb">len</span><span class="p">(</span><span class="n">items</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">takeUpToNumLeft</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="n">iterator</span> <span class="o">=</span> <span class="nb">iter</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  | <span class="n">taken</span> <span class="o">=</span> <span class="mi">0</span> | 
|  | <span class="k">while</span> <span class="n">taken</span> <span class="o"><</span> <span class="n">left</span><span class="p">:</span> | 
|  | <span class="k">try</span><span class="p">:</span> | 
|  | <span class="k">yield</span> <span class="nb">next</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  | <span class="k">except</span> <span class="ne">StopIteration</span><span class="p">:</span> | 
|  | <span class="k">return</span> | 
|  | <span class="n">taken</span> <span class="o">+=</span> <span class="mi">1</span> | 
|  |  | 
|  | <span class="n">p</span> <span class="o">=</span> <span class="nb">range</span><span class="p">(</span><span class="n">partsScanned</span><span class="p">,</span> <span class="nb">min</span><span class="p">(</span><span class="n">partsScanned</span> <span class="o">+</span> <span class="n">numPartsToTry</span><span class="p">,</span> <span class="n">totalParts</span><span class="p">))</span> | 
|  | <span class="n">res</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">context</span><span class="o">.</span><span class="n">runJob</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">takeUpToNumLeft</span><span class="p">,</span> <span class="n">p</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">items</span> <span class="o">+=</span> <span class="n">res</span> | 
|  | <span class="n">partsScanned</span> <span class="o">+=</span> <span class="n">numPartsToTry</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">items</span><span class="p">[:</span><span class="n">num</span><span class="p">]</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.first"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.first.html#pyspark.RDD.first">[docs]</a>    <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">)</span> <span class="o">-></span> <span class="n">T</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the first element in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        T</span> | 
|  | <span class="sd">            the first element</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.take`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.first`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.head`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([2, 3, 4]).first()</span> | 
|  | <span class="sd">        2</span> | 
|  | <span class="sd">        >>> sc.parallelize([]).first()</span> | 
|  | <span class="sd">        Traceback (most recent call last):</span> | 
|  | <span class="sd">            ...</span> | 
|  | <span class="sd">        ValueError: RDD is empty</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">rs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="n">rs</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">rs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"RDD is empty"</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.isEmpty"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.isEmpty.html#pyspark.RDD.isEmpty">[docs]</a>    <span class="k">def</span> <span class="nf">isEmpty</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Returns true if and only if the RDD contains no elements at all.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.3.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        bool</span> | 
|  | <span class="sd">            whether the :class:`RDD` is empty</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.first`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.isEmpty`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        An RDD may be empty even when it has at least 1 partition.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([]).isEmpty()</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> sc.parallelize([1]).isEmpty()</span> | 
|  | <span class="sd">        False</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span> <span class="o">==</span> <span class="mi">0</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.saveAsNewAPIHadoopDataset"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.saveAsNewAPIHadoopDataset.html#pyspark.RDD.saveAsNewAPIHadoopDataset">[docs]</a>    <span class="k">def</span> <span class="nf">saveAsNewAPIHadoopDataset</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">conf</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> | 
|  | <span class="n">keyConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">valueConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Output a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any Hadoop file</span> | 
|  | <span class="sd">        system, using the new Hadoop OutputFormat API (mapreduce package). Keys/values are</span> | 
|  | <span class="sd">        converted for output using either user specified converters or, by default,</span> | 
|  | <span class="sd">        "org.apache.spark.api.python.JavaToWritableConverter".</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        conf : dict</span> | 
|  | <span class="sd">            Hadoop job configuration</span> | 
|  | <span class="sd">        keyConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of key converter (None by default)</span> | 
|  | <span class="sd">        valueConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of value converter (None by default)</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.newAPIHadoopRDD`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsSequenceFile`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import os</span> | 
|  | <span class="sd">        >>> import tempfile</span> | 
|  |  | 
|  | <span class="sd">        Set the related classes</span> | 
|  |  | 
|  | <span class="sd">        >>> output_format_class = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"</span> | 
|  | <span class="sd">        >>> input_format_class = "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat"</span> | 
|  | <span class="sd">        >>> key_class = "org.apache.hadoop.io.IntWritable"</span> | 
|  | <span class="sd">        >>> value_class = "org.apache.hadoop.io.Text"</span> | 
|  |  | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d:</span> | 
|  | <span class="sd">        ...     path = os.path.join(d, "new_hadoop_file")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Create the conf for writing</span> | 
|  | <span class="sd">        ...     write_conf = {</span> | 
|  | <span class="sd">        ...         "mapreduce.job.outputformat.class": (output_format_class),</span> | 
|  | <span class="sd">        ...         "mapreduce.job.output.key.class": key_class,</span> | 
|  | <span class="sd">        ...         "mapreduce.job.output.value.class": value_class,</span> | 
|  | <span class="sd">        ...         "mapreduce.output.fileoutputformat.outputdir": path,</span> | 
|  | <span class="sd">        ...     }</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write a temporary Hadoop file</span> | 
|  | <span class="sd">        ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])</span> | 
|  | <span class="sd">        ...     rdd.saveAsNewAPIHadoopDataset(conf=write_conf)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Create the conf for reading</span> | 
|  | <span class="sd">        ...     read_conf = {"mapreduce.input.fileinputformat.inputdir": path}</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load this Hadoop file as an RDD</span> | 
|  | <span class="sd">        ...     loaded = sc.newAPIHadoopRDD(input_format_class,</span> | 
|  | <span class="sd">        ...         key_class, value_class, conf=read_conf)</span> | 
|  | <span class="sd">        ...     sorted(loaded.collect())</span> | 
|  | <span class="sd">        [(1, ''), (1, 'a'), (3, 'x')]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">jconf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_dictToJavaMap</span><span class="p">(</span><span class="n">conf</span><span class="p">)</span> | 
|  | <span class="n">pickledRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pickled</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">saveAsHadoopDataset</span><span class="p">(</span> | 
|  | <span class="n">pickledRDD</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">jconf</span><span class="p">,</span> <span class="n">keyConverter</span><span class="p">,</span> <span class="n">valueConverter</span><span class="p">,</span> <span class="kc">True</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.saveAsNewAPIHadoopFile"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.saveAsNewAPIHadoopFile.html#pyspark.RDD.saveAsNewAPIHadoopFile">[docs]</a>    <span class="k">def</span> <span class="nf">saveAsNewAPIHadoopFile</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> | 
|  | <span class="n">outputFormatClass</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> | 
|  | <span class="n">keyClass</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">valueClass</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">keyConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">valueConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">conf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Output a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any Hadoop file</span> | 
|  | <span class="sd">        system, using the new Hadoop OutputFormat API (mapreduce package). Key and value types</span> | 
|  | <span class="sd">        will be inferred if not specified. Keys and values are converted for output using either</span> | 
|  | <span class="sd">        user specified converters or "org.apache.spark.api.python.JavaToWritableConverter". The</span> | 
|  | <span class="sd">        `conf` is applied on top of the base Hadoop conf associated with the SparkContext</span> | 
|  | <span class="sd">        of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        path : str</span> | 
|  | <span class="sd">            path to Hadoop file</span> | 
|  | <span class="sd">        outputFormatClass : str</span> | 
|  | <span class="sd">            fully qualified classname of Hadoop OutputFormat</span> | 
|  | <span class="sd">            (e.g. "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")</span> | 
|  | <span class="sd">        keyClass : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of key Writable class</span> | 
|  | <span class="sd">             (e.g. "org.apache.hadoop.io.IntWritable", None by default)</span> | 
|  | <span class="sd">        valueClass : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of value Writable class</span> | 
|  | <span class="sd">            (e.g. "org.apache.hadoop.io.Text", None by default)</span> | 
|  | <span class="sd">        keyConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of key converter (None by default)</span> | 
|  | <span class="sd">        valueConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of value converter (None by default)</span> | 
|  | <span class="sd">        conf : dict, optional</span> | 
|  | <span class="sd">            Hadoop job configuration (None by default)</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.newAPIHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsSequenceFile`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import os</span> | 
|  | <span class="sd">        >>> import tempfile</span> | 
|  |  | 
|  | <span class="sd">        Set the class of output format</span> | 
|  |  | 
|  | <span class="sd">        >>> output_format_class = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"</span> | 
|  |  | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d:</span> | 
|  | <span class="sd">        ...     path = os.path.join(d, "hadoop_file")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write a temporary Hadoop file</span> | 
|  | <span class="sd">        ...     rdd = sc.parallelize([(1, {3.0: "bb"}), (2, {1.0: "aa"}), (3, {2.0: "dd"})])</span> | 
|  | <span class="sd">        ...     rdd.saveAsNewAPIHadoopFile(path, output_format_class)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load this Hadoop file as an RDD</span> | 
|  | <span class="sd">        ...     sorted(sc.sequenceFile(path).collect())</span> | 
|  | <span class="sd">        [(1, {3.0: 'bb'}), (2, {1.0: 'aa'}), (3, {2.0: 'dd'})]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">jconf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_dictToJavaMap</span><span class="p">(</span><span class="n">conf</span><span class="p">)</span> | 
|  | <span class="n">pickledRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pickled</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">saveAsNewAPIHadoopFile</span><span class="p">(</span> | 
|  | <span class="n">pickledRDD</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">,</span> | 
|  | <span class="kc">True</span><span class="p">,</span> | 
|  | <span class="n">path</span><span class="p">,</span> | 
|  | <span class="n">outputFormatClass</span><span class="p">,</span> | 
|  | <span class="n">keyClass</span><span class="p">,</span> | 
|  | <span class="n">valueClass</span><span class="p">,</span> | 
|  | <span class="n">keyConverter</span><span class="p">,</span> | 
|  | <span class="n">valueConverter</span><span class="p">,</span> | 
|  | <span class="n">jconf</span><span class="p">,</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.saveAsHadoopDataset"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.saveAsHadoopDataset.html#pyspark.RDD.saveAsHadoopDataset">[docs]</a>    <span class="k">def</span> <span class="nf">saveAsHadoopDataset</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">conf</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> | 
|  | <span class="n">keyConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">valueConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Output a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any Hadoop file</span> | 
|  | <span class="sd">        system, using the old Hadoop OutputFormat API (mapred package). Keys/values are</span> | 
|  | <span class="sd">        converted for output using either user specified converters or, by default,</span> | 
|  | <span class="sd">        "org.apache.spark.api.python.JavaToWritableConverter".</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        conf : dict</span> | 
|  | <span class="sd">            Hadoop job configuration</span> | 
|  | <span class="sd">        keyConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of key converter (None by default)</span> | 
|  | <span class="sd">        valueConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of value converter (None by default)</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.hadoopRDD`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsSequenceFile`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import os</span> | 
|  | <span class="sd">        >>> import tempfile</span> | 
|  |  | 
|  | <span class="sd">        Set the related classes</span> | 
|  |  | 
|  | <span class="sd">        >>> output_format_class = "org.apache.hadoop.mapred.TextOutputFormat"</span> | 
|  | <span class="sd">        >>> input_format_class = "org.apache.hadoop.mapred.TextInputFormat"</span> | 
|  | <span class="sd">        >>> key_class = "org.apache.hadoop.io.IntWritable"</span> | 
|  | <span class="sd">        >>> value_class = "org.apache.hadoop.io.Text"</span> | 
|  |  | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d:</span> | 
|  | <span class="sd">        ...     path = os.path.join(d, "old_hadoop_file")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Create the conf for writing</span> | 
|  | <span class="sd">        ...     write_conf = {</span> | 
|  | <span class="sd">        ...         "mapred.output.format.class": output_format_class,</span> | 
|  | <span class="sd">        ...         "mapreduce.job.output.key.class": key_class,</span> | 
|  | <span class="sd">        ...         "mapreduce.job.output.value.class": value_class,</span> | 
|  | <span class="sd">        ...         "mapreduce.output.fileoutputformat.outputdir": path,</span> | 
|  | <span class="sd">        ...     }</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write a temporary Hadoop file</span> | 
|  | <span class="sd">        ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])</span> | 
|  | <span class="sd">        ...     rdd.saveAsHadoopDataset(conf=write_conf)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Create the conf for reading</span> | 
|  | <span class="sd">        ...     read_conf = {"mapreduce.input.fileinputformat.inputdir": path}</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load this Hadoop file as an RDD</span> | 
|  | <span class="sd">        ...     loaded = sc.hadoopRDD(input_format_class, key_class, value_class, conf=read_conf)</span> | 
|  | <span class="sd">        ...     sorted(loaded.collect())</span> | 
|  | <span class="sd">        [(0, '1\\t'), (0, '1\\ta'), (0, '3\\tx')]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">jconf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_dictToJavaMap</span><span class="p">(</span><span class="n">conf</span><span class="p">)</span> | 
|  | <span class="n">pickledRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pickled</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">saveAsHadoopDataset</span><span class="p">(</span> | 
|  | <span class="n">pickledRDD</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">jconf</span><span class="p">,</span> <span class="n">keyConverter</span><span class="p">,</span> <span class="n">valueConverter</span><span class="p">,</span> <span class="kc">False</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.saveAsHadoopFile"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.saveAsHadoopFile.html#pyspark.RDD.saveAsHadoopFile">[docs]</a>    <span class="k">def</span> <span class="nf">saveAsHadoopFile</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> | 
|  | <span class="n">outputFormatClass</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> | 
|  | <span class="n">keyClass</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">valueClass</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">keyConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">valueConverter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">conf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">compressionCodecClass</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Output a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any Hadoop file</span> | 
|  | <span class="sd">        system, using the old Hadoop OutputFormat API (mapred package). Key and value types</span> | 
|  | <span class="sd">        will be inferred if not specified. Keys and values are converted for output using either</span> | 
|  | <span class="sd">        user specified converters or "org.apache.spark.api.python.JavaToWritableConverter". The</span> | 
|  | <span class="sd">        `conf` is applied on top of the base Hadoop conf associated with the SparkContext</span> | 
|  | <span class="sd">        of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        path : str</span> | 
|  | <span class="sd">            path to Hadoop file</span> | 
|  | <span class="sd">        outputFormatClass : str</span> | 
|  | <span class="sd">            fully qualified classname of Hadoop OutputFormat</span> | 
|  | <span class="sd">            (e.g. "org.apache.hadoop.mapred.SequenceFileOutputFormat")</span> | 
|  | <span class="sd">        keyClass : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of key Writable class</span> | 
|  | <span class="sd">            (e.g. "org.apache.hadoop.io.IntWritable", None by default)</span> | 
|  | <span class="sd">        valueClass : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of value Writable class</span> | 
|  | <span class="sd">            (e.g. "org.apache.hadoop.io.Text", None by default)</span> | 
|  | <span class="sd">        keyConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of key converter (None by default)</span> | 
|  | <span class="sd">        valueConverter : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of value converter (None by default)</span> | 
|  | <span class="sd">        conf : dict, optional</span> | 
|  | <span class="sd">            (None by default)</span> | 
|  | <span class="sd">        compressionCodecClass : str</span> | 
|  | <span class="sd">            fully qualified classname of the compression codec class</span> | 
|  | <span class="sd">            i.e. "org.apache.hadoop.io.compress.GzipCodec" (None by default)</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.hadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsSequenceFile`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import os</span> | 
|  | <span class="sd">        >>> import tempfile</span> | 
|  |  | 
|  | <span class="sd">        Set the related classes</span> | 
|  |  | 
|  | <span class="sd">        >>> output_format_class = "org.apache.hadoop.mapred.TextOutputFormat"</span> | 
|  | <span class="sd">        >>> input_format_class = "org.apache.hadoop.mapred.TextInputFormat"</span> | 
|  | <span class="sd">        >>> key_class = "org.apache.hadoop.io.IntWritable"</span> | 
|  | <span class="sd">        >>> value_class = "org.apache.hadoop.io.Text"</span> | 
|  |  | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d:</span> | 
|  | <span class="sd">        ...     path = os.path.join(d, "old_hadoop_file")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write a temporary Hadoop file</span> | 
|  | <span class="sd">        ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])</span> | 
|  | <span class="sd">        ...     rdd.saveAsHadoopFile(path, output_format_class, key_class, value_class)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load this Hadoop file as an RDD</span> | 
|  | <span class="sd">        ...     loaded = sc.hadoopFile(path, input_format_class, key_class, value_class)</span> | 
|  | <span class="sd">        ...     sorted(loaded.collect())</span> | 
|  | <span class="sd">        [(0, '1\\t'), (0, '1\\ta'), (0, '3\\tx')]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">jconf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_dictToJavaMap</span><span class="p">(</span><span class="n">conf</span><span class="p">)</span> | 
|  | <span class="n">pickledRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pickled</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">saveAsHadoopFile</span><span class="p">(</span> | 
|  | <span class="n">pickledRDD</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">,</span> | 
|  | <span class="kc">True</span><span class="p">,</span> | 
|  | <span class="n">path</span><span class="p">,</span> | 
|  | <span class="n">outputFormatClass</span><span class="p">,</span> | 
|  | <span class="n">keyClass</span><span class="p">,</span> | 
|  | <span class="n">valueClass</span><span class="p">,</span> | 
|  | <span class="n">keyConverter</span><span class="p">,</span> | 
|  | <span class="n">valueConverter</span><span class="p">,</span> | 
|  | <span class="n">jconf</span><span class="p">,</span> | 
|  | <span class="n">compressionCodecClass</span><span class="p">,</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.saveAsSequenceFile"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.saveAsSequenceFile.html#pyspark.RDD.saveAsSequenceFile">[docs]</a>    <span class="k">def</span> <span class="nf">saveAsSequenceFile</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">compressionCodecClass</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Output a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any Hadoop file</span> | 
|  | <span class="sd">        system, using the "org.apache.hadoop.io.Writable" types that we convert from the</span> | 
|  | <span class="sd">        RDD's key and value types. The mechanism is as follows:</span> | 
|  |  | 
|  | <span class="sd">            1. Pickle is used to convert pickled Python RDD into RDD of Java objects.</span> | 
|  | <span class="sd">            2. Keys and values of this Java RDD are converted to Writables and written out.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        path : str</span> | 
|  | <span class="sd">            path to sequence file</span> | 
|  | <span class="sd">        compressionCodecClass : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of the compression codec class</span> | 
|  | <span class="sd">            i.e. "org.apache.hadoop.io.compress.GzipCodec" (None by default)</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.sequenceFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopFile`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsNewAPIHadoopDataset`</span> | 
|  | <span class="sd">        :meth:`RDD.saveAsSequenceFile`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import os</span> | 
|  | <span class="sd">        >>> import tempfile</span> | 
|  |  | 
|  | <span class="sd">        Set the related classes</span> | 
|  |  | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d:</span> | 
|  | <span class="sd">        ...     path = os.path.join(d, "sequence_file")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write a temporary sequence file</span> | 
|  | <span class="sd">        ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])</span> | 
|  | <span class="sd">        ...     rdd.saveAsSequenceFile(path)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load this sequence file as an RDD</span> | 
|  | <span class="sd">        ...     loaded = sc.sequenceFile(path)</span> | 
|  | <span class="sd">        ...     sorted(loaded.collect())</span> | 
|  | <span class="sd">        [(1, ''), (1, 'a'), (3, 'x')]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">pickledRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pickled</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">saveAsSequenceFile</span><span class="p">(</span> | 
|  | <span class="n">pickledRDD</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">compressionCodecClass</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.saveAsPickleFile"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.saveAsPickleFile.html#pyspark.RDD.saveAsPickleFile">[docs]</a>    <span class="k">def</span> <span class="nf">saveAsPickleFile</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">batchSize</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Save this RDD as a SequenceFile of serialized objects. The serializer</span> | 
|  | <span class="sd">        used is :class:`pyspark.serializers.CPickleSerializer`, with a default</span> | 
|  | <span class="sd">        batch size of 10.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        path : str</span> | 
|  | <span class="sd">            path to pickled file</span> | 
|  | <span class="sd">        batchSize : int, optional, default 10</span> | 
|  | <span class="sd">            the number of Python objects represented as a single Java object.</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.pickleFile`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import os</span> | 
|  | <span class="sd">        >>> import tempfile</span> | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d:</span> | 
|  | <span class="sd">        ...     path = os.path.join(d, "pickle_file")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write a temporary pickled file</span> | 
|  | <span class="sd">        ...     sc.parallelize(range(10)).saveAsPickleFile(path, 3)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load pickled file as an RDD</span> | 
|  | <span class="sd">        ...     sorted(sc.pickleFile(path, 3).collect())</span> | 
|  | <span class="sd">        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]</span> | 
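|  |  
|  | <span class="sd">        A `batchSize` of 0 switches to automatic, size-based batching. An</span> | 
|  | <span class="sd">        illustrative round trip (the name "auto_pickle" is arbitrary):</span> | 
|  |  
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d:</span> | 
|  | <span class="sd">        ...     path = os.path.join(d, "auto_pickle")</span> | 
|  | <span class="sd">        ...     sc.parallelize(range(5)).saveAsPickleFile(path, 0)</span> | 
|  | <span class="sd">        ...     sorted(sc.pickleFile(path).collect())</span> | 
|  | <span class="sd">        [0, 1, 2, 3, 4]</span> | 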
|  | <span class="sd">        """</span> | 
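|  | <span class="c1"># batchSize == 0 requests size-adaptive batching; any other value pins a</span> | 
|  | <span class="c1"># fixed number of Python objects per pickled Java object.</span> | 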
|  | <span class="n">ser</span><span class="p">:</span> <span class="n">Serializer</span> | 
|  | <span class="k">if</span> <span class="n">batchSize</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="n">ser</span> <span class="o">=</span> <span class="n">AutoBatchedSerializer</span><span class="p">(</span><span class="n">CPickleSerializer</span><span class="p">())</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="n">ser</span> <span class="o">=</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">CPickleSerializer</span><span class="p">(),</span> <span class="n">batchSize</span><span class="p">)</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_reserialize</span><span class="p">(</span><span class="n">ser</span><span class="p">)</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">saveAsObjectFile</span><span class="p">(</span><span class="n">path</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.saveAsTextFile"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.saveAsTextFile.html#pyspark.RDD.saveAsTextFile">[docs]</a>    <span class="k">def</span> <span class="nf">saveAsTextFile</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">compressionCodecClass</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Save this RDD as a text file, using string representations of elements.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        path : str</span> | 
|  | <span class="sd">            path to text file</span> | 
|  | <span class="sd">        compressionCodecClass : str, optional</span> | 
|  | <span class="sd">            fully qualified classname of the compression codec class</span> | 
|  | <span class="sd">            e.g. "org.apache.hadoop.io.compress.GzipCodec" (None by default)</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`SparkContext.textFile`</span> | 
|  | <span class="sd">        :meth:`SparkContext.wholeTextFiles`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> import os</span> | 
|  | <span class="sd">        >>> import tempfile</span> | 
|  | <span class="sd">        >>> from fileinput import input</span> | 
|  | <span class="sd">        >>> from glob import glob</span> | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d1:</span> | 
|  | <span class="sd">        ...     path1 = os.path.join(d1, "text_file1")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write a temporary text file</span> | 
|  | <span class="sd">        ...     sc.parallelize(range(10)).saveAsTextFile(path1)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load text file as an RDD</span> | 
|  | <span class="sd">        ...     ''.join(sorted(input(glob(path1 + "/part-0000*"))))</span> | 
|  | <span class="sd">        '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'</span> | 
|  |  | 
|  | <span class="sd">        Empty lines are tolerated when saving to text files.</span> | 
|  |  | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d2:</span> | 
|  | <span class="sd">        ...     path2 = os.path.join(d2, "text2_file2")</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write another temporary text file</span> | 
|  | <span class="sd">        ...     sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(path2)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load text file as an RDD</span> | 
|  | <span class="sd">        ...     ''.join(sorted(input(glob(path2 + "/part-0000*"))))</span> | 
|  | <span class="sd">        '\\n\\n\\nbar\\nfoo\\n'</span> | 
|  |  | 
|  | <span class="sd">        Using compressionCodecClass</span> | 
|  |  | 
|  | <span class="sd">        >>> from fileinput import input, hook_compressed</span> | 
|  | <span class="sd">        >>> with tempfile.TemporaryDirectory() as d3:</span> | 
|  | <span class="sd">        ...     path3 = os.path.join(d3, "text3")</span> | 
|  | <span class="sd">        ...     codec = "org.apache.hadoop.io.compress.GzipCodec"</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Write another temporary text file with specified codec</span> | 
|  | <span class="sd">        ...     sc.parallelize(['foo', 'bar']).saveAsTextFile(path3, codec)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        ...     # Load text file as an RDD</span> | 
|  | <span class="sd">        ...     result = sorted(input(glob(path3 + "/part*.gz"), openhook=hook_compressed))</span> | 
|  | <span class="sd">        ...     ''.join([r.decode('utf-8') if isinstance(r, bytes) else r for r in result])</span> | 
|  | <span class="sd">        'bar\\nfoo\\n'</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
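|  | <span class="c1"># Normalize every element to UTF-8 bytes: bytes pass through, strings are</span> | 
|  | <span class="c1"># encoded, and anything else is stringified first.</span> | 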
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">split</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]:</span> | 
|  | <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="nb">bytes</span><span class="p">):</span> | 
|  | <span class="k">yield</span> <span class="n">x</span> | 
|  | <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> | 
|  | <span class="k">yield</span> <span class="n">x</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">yield</span> <span class="nb">str</span><span class="p">(</span><span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">keyed</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> | 
|  | <span class="n">keyed</span><span class="o">.</span><span class="n">_bypass_serializer</span> <span class="o">=</span> <span class="kc">True</span>  <span class="c1"># type: ignore[attr-defined]</span> | 
|  |  | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
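|  | <span class="c1"># Resolve the codec class on the JVM by reflection, then write the already</span> | 
|  | <span class="c1"># UTF-8 encoded lines out as text.</span> | 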
|  | <span class="k">if</span> <span class="n">compressionCodecClass</span><span class="p">:</span> | 
|  | <span class="n">compressionCodec</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">java</span><span class="o">.</span><span class="n">lang</span><span class="o">.</span><span class="n">Class</span><span class="o">.</span><span class="n">forName</span><span class="p">(</span><span class="n">compressionCodecClass</span><span class="p">)</span> | 
|  | <span class="n">keyed</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">BytesToString</span><span class="p">())</span><span class="o">.</span><span class="n">saveAsTextFile</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">compressionCodec</span><span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="n">keyed</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">BytesToString</span><span class="p">())</span><span class="o">.</span><span class="n">saveAsTextFile</span><span class="p">(</span><span class="n">path</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="c1"># Pair functions</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.collectAsMap"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.collectAsMap.html#pyspark.RDD.collectAsMap">[docs]</a>    <span class="k">def</span> <span class="nf">collectAsMap</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the key-value pairs in this RDD to the master as a dictionary.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`dict`</span> | 
|  | <span class="sd">            a dictionary of (key, value) pairs</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.countByValue`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This method should only be used if the resulting data is expected</span> | 
|  | <span class="sd">        to be small, as all the data is loaded into the driver's memory.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()</span> | 
|  | <span class="sd">        >>> m[1]</span> | 
|  | <span class="sd">        2</span> | 
|  | <span class="sd">        >>> m[3]</span> | 
|  | <span class="sd">        4</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="nb">dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.keys"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.keys.html#pyspark.RDD.keys">[docs]</a>    <span class="k">def</span> <span class="nf">keys</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[K]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return an RDD with the keys of each tuple.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` only containing the keys</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.values`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([(1, 2), (3, 4)]).keys()</span> | 
|  | <span class="sd">        >>> rdd.collect()</span> | 
|  | <span class="sd">        [1, 3]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.values"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.values.html#pyspark.RDD.values">[docs]</a>    <span class="k">def</span> <span class="nf">values</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[V]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return an RDD with the values of each tuple.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` only containing the values</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.keys`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([(1, 2), (3, 4)]).values()</span> | 
|  | <span class="sd">        >>> rdd.collect()</span> | 
|  | <span class="sd">        [2, 4]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.reduceByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.reduceByKey.html#pyspark.RDD.reduceByKey">[docs]</a>    <span class="k">def</span> <span class="nf">reduceByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">V</span><span class="p">,</span> <span class="n">V</span><span class="p">],</span> <span class="n">V</span><span class="p">],</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Merge the values for each key using an associative and commutative reduce function.</span> | 
|  |  | 
|  | <span class="sd">        This will also perform the merging locally on each mapper before</span> | 
|  | <span class="sd">        sending results to a reducer, similarly to a "combiner" in MapReduce.</span> | 
|  |  | 
|  | <span class="sd">        Output will be partitioned with `numPartitions` partitions, or</span> | 
|  | <span class="sd">        the default parallelism level if `numPartitions` is not specified.</span> | 
|  | <span class="sd">        The default partitioner is hash-partitioning.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.6.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        func : function</span> | 
|  | <span class="sd">            the reduce function</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            function to compute the partition index</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and the aggregated result for each key</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduceByKeyLocally`</span> | 
|  | <span class="sd">        :meth:`RDD.combineByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregateByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.foldByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.groupByKey`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> from operator import add</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</span> | 
|  | <span class="sd">        >>> sorted(rdd.reduceByKey(add).collect())</span> | 
|  | <span class="sd">        [('a', 2), ('b', 1)]</span> | 
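|  |  
|  | <span class="sd">        An illustrative sketch of `numPartitions`: the aggregated values are</span> | 
|  | <span class="sd">        unchanged, only the partition count of the result differs.</span> | 
|  |  
|  | <span class="sd">        >>> rdd.reduceByKey(add, numPartitions=2).getNumPartitions()</span> | 
|  | <span class="sd">        2</span> | 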
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">combineByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.reduceByKeyLocally"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.reduceByKeyLocally.html#pyspark.RDD.reduceByKeyLocally">[docs]</a>    <span class="k">def</span> <span class="nf">reduceByKeyLocally</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">V</span><span class="p">,</span> <span class="n">V</span><span class="p">],</span> <span class="n">V</span><span class="p">])</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Merge the values for each key using an associative and commutative reduce function, but</span> | 
|  | <span class="sd">        return the results immediately to the master as a dictionary.</span> | 
|  |  | 
|  | <span class="sd">        This will also perform the merging locally on each mapper before</span> | 
|  | <span class="sd">        sending results to a reducer, similarly to a "combiner" in MapReduce.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        func : function</span> | 
|  | <span class="sd">            the reduce function</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        dict</span> | 
|  | <span class="sd">            a dict containing the keys and the aggregated result for each key</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduceByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregateByKey`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> from operator import add</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</span> | 
|  | <span class="sd">        >>> sorted(rdd.reduceByKeyLocally(add).items())</span> | 
|  | <span class="sd">        [('a', 2), ('b', 1)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">func</span> <span class="o">=</span> <span class="n">fail_on_stopiteration</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> | 
|  |  | 
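|  | <span class="c1"># Two-phase reduce: each partition folds its pairs into a local dict, and</span> | 
|  | <span class="c1"># the per-partition dicts are then merged pairwise via reduce().</span> | 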
|  | <span class="k">def</span> <span class="nf">reducePartition</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]]:</span> | 
|  | <span class="n">m</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span> | 
|  | <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">m</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">func</span><span class="p">(</span><span class="n">m</span><span class="p">[</span><span class="n">k</span><span class="p">],</span> <span class="n">v</span><span class="p">)</span> <span class="k">if</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">m</span> <span class="k">else</span> <span class="n">v</span> | 
|  | <span class="k">yield</span> <span class="n">m</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">mergeMaps</span><span class="p">(</span><span class="n">m1</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">],</span> <span class="n">m2</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">])</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]:</span> | 
|  | <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">m2</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> | 
|  | <span class="n">m1</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">func</span><span class="p">(</span><span class="n">m1</span><span class="p">[</span><span class="n">k</span><span class="p">],</span> <span class="n">v</span><span class="p">)</span> <span class="k">if</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">m1</span> <span class="k">else</span> <span class="n">v</span> | 
|  | <span class="k">return</span> <span class="n">m1</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">reducePartition</span><span class="p">)</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">mergeMaps</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.countByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.countByKey.html#pyspark.RDD.countByKey">[docs]</a>    <span class="k">def</span> <span class="nf">countByKey</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Count the number of elements for each key, and return the result to the</span> | 
|  | <span class="sd">        master as a dictionary.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        dict</span> | 
|  | <span class="sd">            a dictionary of (key, count) pairs</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.collectAsMap`</span> | 
|  | <span class="sd">        :meth:`RDD.countByValue`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</span> | 
|  | <span class="sd">        >>> sorted(rdd.countByKey().items())</span> | 
|  | <span class="sd">        [('a', 2), ('b', 1)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">countByValue</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.join"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.join.html#pyspark.RDD.join">[docs]</a>    <span class="k">def</span> <span class="nf">join</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Tuple[V, U]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return an RDD containing all pairs of elements with matching keys in</span> | 
|  | <span class="sd">        `self` and `other`.</span> | 
|  |  | 
|  | <span class="sd">        Each pair of elements will be returned as a (k, (v1, v2)) tuple, where</span> | 
|  | <span class="sd">        (k, v1) is in `self` and (k, v2) is in `other`.</span> | 
|  |  | 
|  | <span class="sd">        Performs a hash join across the cluster.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing all pairs of elements with matching keys</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.leftOuterJoin`</span> | 
|  | <span class="sd">        :meth:`RDD.rightOuterJoin`</span> | 
|  | <span class="sd">        :meth:`RDD.fullOuterJoin`</span> | 
|  | <span class="sd">        :meth:`RDD.cogroup`</span> | 
|  | <span class="sd">        :meth:`RDD.groupWith`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.join`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 2), ("a", 3)])</span> | 
|  | <span class="sd">        >>> sorted(rdd1.join(rdd2).collect())</span> | 
|  | <span class="sd">        [('a', (1, 2)), ('a', (1, 3))]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">python_join</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.leftOuterJoin"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.leftOuterJoin.html#pyspark.RDD.leftOuterJoin">[docs]</a>    <span class="k">def</span> <span class="nf">leftOuterJoin</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Tuple[V, Optional[U]]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Perform a left outer join of `self` and `other`.</span> | 
|  |  | 
|  | <span class="sd">        For each element (k, v) in `self`, the resulting RDD will either</span> | 
|  | <span class="sd">        contain all pairs (k, (v, w)) for w in `other`, or the pair</span> | 
|  | <span class="sd">        (k, (v, None)) if no elements in `other` have key k.</span> | 
|  |  | 
|  | <span class="sd">        Hash-partitions the resulting RDD into the given number of partitions.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` of all pairs (k, (v, w)), with w = None for unmatched keys</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.join`</span> | 
|  | <span class="sd">        :meth:`RDD.rightOuterJoin`</span> | 
|  | <span class="sd">        :meth:`RDD.fullOuterJoin`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.join`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 2)])</span> | 
|  | <span class="sd">        >>> sorted(rdd1.leftOuterJoin(rdd2).collect())</span> | 
|  | <span class="sd">        [('a', (1, 2)), ('b', (4, None))]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">python_left_outer_join</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.rightOuterJoin"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.rightOuterJoin.html#pyspark.RDD.rightOuterJoin">[docs]</a>    <span class="k">def</span> <span class="nf">rightOuterJoin</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Tuple[Optional[V], U]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Perform a right outer join of `self` and `other`.</span> | 
|  |  | 
|  | <span class="sd">        For each element (k, w) in `other`, the resulting RDD will either</span> | 
|  | <span class="sd">        contain all pairs (k, (v, w)) for v in `self`, or the pair (k, (None, w))</span> | 
|  | <span class="sd">        if no elements in `self` have key k.</span> | 
|  |  | 
|  | <span class="sd">        Hash-partitions the resulting RDD into the given number of partitions.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` of all pairs (k, (v, w)), with v = None for unmatched keys</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.join`</span> | 
|  | <span class="sd">        :meth:`RDD.leftOuterJoin`</span> | 
|  | <span class="sd">        :meth:`RDD.fullOuterJoin`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.join`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 2)])</span> | 
|  | <span class="sd">        >>> sorted(rdd2.rightOuterJoin(rdd1).collect())</span> | 
|  | <span class="sd">        [('a', (2, 1)), ('b', (None, 4))]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">python_right_outer_join</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.fullOuterJoin"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.fullOuterJoin.html#pyspark.RDD.fullOuterJoin">[docs]</a>    <span class="k">def</span> <span class="nf">fullOuterJoin</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Tuple[Optional[V], Optional[U]]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Perform a full outer join of `self` and `other`.</span> | 
|  |  | 
|  | <span class="sd">        For each element (k, v) in `self`, the resulting RDD will either</span> | 
|  | <span class="sd">        contain all pairs (k, (v, w)) for w in `other`, or the pair</span> | 
|  | <span class="sd">        (k, (v, None)) if no elements in `other` have key k.</span> | 
|  |  | 
|  | <span class="sd">        Similarly, for each element (k, w) in `other`, the resulting RDD will</span> | 
|  | <span class="sd">        either contain all pairs (k, (v, w)) for v in `self`, or the pair</span> | 
|  | <span class="sd">        (k, (None, w)) if no elements in `self` have key k.</span> | 
|  |  | 
|  | <span class="sd">        Hash-partitions the resulting RDD into the given number of partitions.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` of all pairs (k, (v, w)), with None filling the missing side</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.join`</span> | 
|  | <span class="sd">        :meth:`RDD.leftOuterJoin`</span> | 
|  | <span class="sd">        :meth:`RDD.rightOuterJoin`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.join`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 2), ("c", 8)])</span> | 
|  | <span class="sd">        >>> sorted(rdd1.fullOuterJoin(rdd2).collect())</span> | 
|  | <span class="sd">        [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">python_full_outer_join</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="c1"># TODO: add option to control map-side combining</span> | 
|  | <span class="c1"># portable_hash is used as the default because the builtin hash of None</span> | 
|  | <span class="c1"># differs across machines.</span> | 
|  | <div class="viewcode-block" id="RDD.partitionBy"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.partitionBy.html#pyspark.RDD.partitionBy">[docs]</a>    <span class="k">def</span> <span class="nf">partitionBy</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a copy of the RDD partitioned using the specified partitioner.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            function to compute the partition index</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` partitioned using the specified partitioner</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.repartition`</span> | 
|  | <span class="sd">        :meth:`RDD.repartitionAndSortWithinPartitions`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))</span> | 
|  | <span class="sd">        >>> sets = pairs.partitionBy(2).glom().collect()</span> | 
|  | <span class="sd">        >>> len(set(sets[0]).intersection(set(sets[1])))</span> | 
|  | <span class="sd">        0</span> | 
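|  |  
|  | <span class="sd">        A sketch with a custom `partitionFunc` (the keying function is</span> | 
|  | <span class="sd">        illustrative; any function mapping a key to an int works):</span> | 
|  |  
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])</span> | 
|  | <span class="sd">        >>> parts = rdd.partitionBy(2, partitionFunc=lambda k: ord(k)).glom().collect()</span> | 
|  | <span class="sd">        >>> sorted(len(p) for p in parts)</span> | 
|  | <span class="sd">        [1, 2]</span> | 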
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">numPartitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="n">numPartitions</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_defaultReducePartitions</span><span class="p">()</span> | 
|  | <span class="n">partitioner</span> <span class="o">=</span> <span class="n">Partitioner</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">partitioner</span> <span class="o">==</span> <span class="n">partitioner</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span> | 
|  |  | 
|  | <span class="c1"># Transferring O(n) objects to Java is too expensive.</span> | 
|  | <span class="c1"># Instead, we'll form the hash buckets in Python,</span> | 
|  | <span class="c1"># transferring O(numPartitions) objects to Java.</span> | 
|  | <span class="c1"># Each object is a (splitNumber, [objects]) pair.</span> | 
|  | <span class="c1"># In order to avoid too huge objects, the objects are</span> | 
|  | <span class="c1"># grouped into chunks.</span> | 
|  | <span class="n">outputSerializer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_unbatched_serializer</span> | 
|  |  | 
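|  | <span class="c1"># Flush buckets early once Python-side memory use crosses half of the</span> | 
|  | <span class="c1"># worker's memory budget.</span> | 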
|  | <span class="n">limit</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_memory_limit</span><span class="p">()</span> <span class="o">/</span> <span class="mi">2</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">add_shuffle_key</span><span class="p">(</span><span class="n">split</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]:</span> | 
|  |  | 
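|  | <span class="c1"># Emit a flat byte stream that alternates pack_long(partition id) with a</span> | 
|  | <span class="c1"># serialized chunk of that partition's (key, value) pairs.</span> | 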
|  | <span class="n">buckets</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span> | 
|  | <span class="n">c</span><span class="p">,</span> <span class="n">batch</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="nb">min</span><span class="p">(</span><span class="mi">10</span> <span class="o">*</span> <span class="n">numPartitions</span><span class="p">,</span> <span class="mi">1000</span><span class="p">)</span>  <span class="c1"># type: ignore[operator]</span> | 
|  |  | 
|  | <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> | 
|  | <span class="n">buckets</span><span class="p">[</span><span class="n">partitionFunc</span><span class="p">(</span><span class="n">k</span><span class="p">)</span> <span class="o">%</span> <span class="n">numPartitions</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">))</span>  <span class="c1"># type: ignore[operator]</span> | 
|  | <span class="n">c</span> <span class="o">+=</span> <span class="mi">1</span> | 
|  |  | 
|  | <span class="c1"># check used memory and avg size of chunk of objects</span> | 
|  | <span class="k">if</span> <span class="n">c</span> <span class="o">%</span> <span class="mi">1000</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">get_used_memory</span><span class="p">()</span> <span class="o">></span> <span class="n">limit</span> <span class="ow">or</span> <span class="n">c</span> <span class="o">></span> <span class="n">batch</span><span class="p">:</span> | 
|  | <span class="n">n</span><span class="p">,</span> <span class="n">size</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">buckets</span><span class="p">),</span> <span class="mi">0</span> | 
|  | <span class="k">for</span> <span class="n">split</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">buckets</span><span class="o">.</span><span class="n">keys</span><span class="p">()):</span> | 
|  | <span class="k">yield</span> <span class="n">pack_long</span><span class="p">(</span><span class="n">split</span><span class="p">)</span> | 
|  | <span class="n">d</span> <span class="o">=</span> <span class="n">outputSerializer</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">buckets</span><span class="p">[</span><span class="n">split</span><span class="p">])</span> | 
|  | <span class="k">del</span> <span class="n">buckets</span><span class="p">[</span><span class="n">split</span><span class="p">]</span> | 
|  | <span class="k">yield</span> <span class="n">d</span> | 
|  | <span class="n">size</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">avg</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">size</span> <span class="o">/</span> <span class="n">n</span><span class="p">)</span> <span class="o">>></span> <span class="mi">20</span> | 
|  | <span class="c1"># let 1M < avg < 10M</span> | 
|  | <span class="k">if</span> <span class="n">avg</span> <span class="o"><</span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="n">batch</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">,</span> <span class="n">batch</span> <span class="o">*</span> <span class="mf">1.5</span><span class="p">)</span>  <span class="c1"># type: ignore[assignment]</span> | 
|  | <span class="k">elif</span> <span class="n">avg</span> <span class="o">></span> <span class="mi">10</span><span class="p">:</span> | 
|  | <span class="n">batch</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">batch</span> <span class="o">/</span> <span class="mf">1.5</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span> | 
|  | <span class="n">c</span> <span class="o">=</span> <span class="mi">0</span> | 
|  |  | 
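|  | <span class="c1"># Flush whatever remains in the buckets at the end of the partition.</span> | 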
|  | <span class="k">for</span> <span class="n">split</span><span class="p">,</span> <span class="n">items</span> <span class="ow">in</span> <span class="n">buckets</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> | 
|  | <span class="k">yield</span> <span class="n">pack_long</span><span class="p">(</span><span class="n">split</span><span class="p">)</span> | 
|  | <span class="k">yield</span> <span class="n">outputSerializer</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">items</span><span class="p">)</span> | 
|  |  | 
|  | <span class="n">keyed</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">add_shuffle_key</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | 
|  | <span class="n">keyed</span><span class="o">.</span><span class="n">_bypass_serializer</span> <span class="o">=</span> <span class="kc">True</span>  <span class="c1"># type: ignore[attr-defined]</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
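|  | <span class="c1"># The JVM re-pairs the (id, chunk) stream and shuffles it by the</span> | 
|  | <span class="c1"># precomputed partition ids.</span> | 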
|  | <span class="k">with</span> <span class="n">SCCallSiteSync</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">context</span><span class="p">):</span> | 
|  | <span class="n">pairRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PairwiseRDD</span><span class="p">(</span><span class="n">keyed</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">())</span><span class="o">.</span><span class="n">asJavaPairRDD</span><span class="p">()</span> | 
|  | <span class="n">jpartitioner</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonPartitioner</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="nb">id</span><span class="p">(</span><span class="n">partitionFunc</span><span class="p">))</span> | 
|  | <span class="n">jrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">valueOfPair</span><span class="p">(</span><span class="n">pairRDD</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">jpartitioner</span><span class="p">))</span> | 
|  | <span class="n">rdd</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span> <span class="o">=</span> <span class="n">RDD</span><span class="p">(</span><span class="n">jrdd</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">,</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">outputSerializer</span><span class="p">))</span> | 
|  | <span class="n">rdd</span><span class="o">.</span><span class="n">partitioner</span> <span class="o">=</span> <span class="n">partitioner</span> | 
|  | <span class="k">return</span> <span class="n">rdd</span></div> | 
|  |  | 
|  | <span class="c1"># TODO: add control over map-side aggregation</span> | 
|  | <div class="viewcode-block" id="RDD.combineByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.combineByKey.html#pyspark.RDD.combineByKey">[docs]</a>    <span class="k">def</span> <span class="nf">combineByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">createCombiner</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">V</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> | 
|  | <span class="n">mergeValue</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">V</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> | 
|  | <span class="n">mergeCombiners</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">U</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Generic function to combine the elements for each key using a custom</span> | 
|  | <span class="sd">        set of aggregation functions.</span> | 
|  |  | 
|  | <span class="sd">        Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined</span> | 
|  | <span class="sd">        type" C.</span> | 
|  |  | 
|  | <span class="sd">        To avoid memory allocation, both mergeValue and mergeCombiners are allowed to</span> | 
|  | <span class="sd">        modify and return their first argument instead of creating a new C.</span> | 
|  |  | 
|  | <span class="sd">        In addition, users can control the partitioning of the output RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        createCombiner : function</span> | 
|  | <span class="sd">            a function to turns a V into a C</span> | 
|  | <span class="sd">        mergeValue : function</span> | 
|  | <span class="sd">            a function to merge a V into a C</span> | 
|  | <span class="sd">        mergeCombiners : function</span> | 
|  | <span class="sd">            a function to combine two C's into a single one</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            function to compute the partition index</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and the aggregated result for each key</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduceByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregateByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.foldByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.groupByKey`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        V and C can be different -- for example, one might group an RDD of type</span> | 
|  | <span class="sd">            (Int, Int) into an RDD of type (Int, List[Int]).</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 2)])</span> | 
|  | <span class="sd">        >>> def to_list(a):</span> | 
|  | <span class="sd">        ...     return [a]</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> def append(a, b):</span> | 
|  | <span class="sd">        ...     a.append(b)</span> | 
|  | <span class="sd">        ...     return a</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> def extend(a, b):</span> | 
|  | <span class="sd">        ...     a.extend(b)</span> | 
|  | <span class="sd">        ...     return a</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> sorted(rdd.combineByKey(to_list, append, extend).collect())</span> | 
|  | <span class="sd">        [('a', [1, 2]), ('b', [1])]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">numPartitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="n">numPartitions</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_defaultReducePartitions</span><span class="p">()</span> | 
|  |  | 
|  | <span class="n">serializer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">serializer</span> | 
|  | <span class="n">memory</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_memory_limit</span><span class="p">()</span> | 
|  | <span class="n">agg</span> <span class="o">=</span> <span class="n">Aggregator</span><span class="p">(</span><span class="n">createCombiner</span><span class="p">,</span> <span class="n">mergeValue</span><span class="p">,</span> <span class="n">mergeCombiners</span><span class="p">)</span> | 
|  |  | 
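|  | <span class="c1"># Map-side combine: merge values within each input partition, leaving ~10% memory headroom before the merger spills to disk.</span> | 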
|  | <span class="k">def</span> <span class="nf">combineLocally</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">U</span><span class="p">]]:</span> | 
|  | <span class="n">merger</span> <span class="o">=</span> <span class="n">ExternalMerger</span><span class="p">(</span><span class="n">agg</span><span class="p">,</span> <span class="n">memory</span> <span class="o">*</span> <span class="mf">0.9</span><span class="p">,</span> <span class="n">serializer</span><span class="p">)</span> | 
|  | <span class="n">merger</span><span class="o">.</span><span class="n">mergeValues</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">merger</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> | 
|  |  | 
|  | <span class="n">locally_combined</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">combineLocally</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | 
|  | <span class="n">shuffled</span> <span class="o">=</span> <span class="n">locally_combined</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span><span class="p">)</span> | 
|  |  | 
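|  | <span class="c1"># Reduce-side: merge the pre-combined (key, combiner) pairs arriving from the shuffle.</span> | 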
|  | <span class="k">def</span> <span class="nf">_mergeCombiners</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">U</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">U</span><span class="p">]]:</span> | 
|  | <span class="n">merger</span> <span class="o">=</span> <span class="n">ExternalMerger</span><span class="p">(</span><span class="n">agg</span><span class="p">,</span> <span class="n">memory</span><span class="p">,</span> <span class="n">serializer</span><span class="p">)</span> | 
|  | <span class="n">merger</span><span class="o">.</span><span class="n">mergeCombiners</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">merger</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">shuffled</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">_mergeCombiners</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.aggregateByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.aggregateByKey.html#pyspark.RDD.aggregateByKey">[docs]</a>    <span class="k">def</span> <span class="nf">aggregateByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">zeroValue</span><span class="p">:</span> <span class="n">U</span><span class="p">,</span> | 
|  | <span class="n">seqFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">V</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> | 
|  | <span class="n">combFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">U</span><span class="p">,</span> <span class="n">U</span><span class="p">],</span> <span class="n">U</span><span class="p">],</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Aggregate the values of each key, using given combine functions and a neutral</span> | 
|  | <span class="sd">        "zero value". This function can return a different result type, U, than the type</span> | 
|  | <span class="sd">        of the values in this RDD, V. Thus, we need one operation for merging a V into</span> | 
|  | <span class="sd">        a U and one operation for merging two U's, The former operation is used for merging</span> | 
|  | <span class="sd">        values within a partition, and the latter is used for merging values between</span> | 
|  | <span class="sd">        partitions. To avoid memory allocation, both of these functions are</span> | 
|  | <span class="sd">        allowed to modify and return their first argument instead of creating a new U.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        zeroValue : U</span> | 
|  | <span class="sd">            the initial value for the accumulated result of each partition</span> | 
|  | <span class="sd">        seqFunc : function</span> | 
|  | <span class="sd">            a function to merge a V into a U</span> | 
|  | <span class="sd">        combFunc : function</span> | 
|  | <span class="sd">            a function to combine two U's into a single one</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            function to compute the partition index</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and the aggregated result for each key</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduceByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.combineByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.foldByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.groupByKey`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 2)])</span> | 
|  | <span class="sd">        >>> seqFunc = (lambda x, y: (x[0] + y, x[1] + 1))</span> | 
|  | <span class="sd">        >>> combFunc = (lambda x, y: (x[0] + y[0], x[1] + y[1]))</span> | 
|  | <span class="sd">        >>> sorted(rdd.aggregateByKey((0, 0), seqFunc, combFunc).collect())</span> | 
|  | <span class="sd">        [('a', (3, 2)), ('b', (1, 1))]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">createZero</span><span class="p">()</span> <span class="o">-></span> <span class="n">U</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">copy</span><span class="o">.</span><span class="n">deepcopy</span><span class="p">(</span><span class="n">zeroValue</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">combineByKey</span><span class="p">(</span> | 
|  | <span class="k">lambda</span> <span class="n">v</span><span class="p">:</span> <span class="n">seqFunc</span><span class="p">(</span><span class="n">createZero</span><span class="p">(),</span> <span class="n">v</span><span class="p">),</span> <span class="n">seqFunc</span><span class="p">,</span> <span class="n">combFunc</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.foldByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.foldByKey.html#pyspark.RDD.foldByKey">[docs]</a>    <span class="k">def</span> <span class="nf">foldByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">zeroValue</span><span class="p">:</span> <span class="n">V</span><span class="p">,</span> | 
|  | <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">V</span><span class="p">,</span> <span class="n">V</span><span class="p">],</span> <span class="n">V</span><span class="p">],</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Merge the values for each key using an associative function "func"</span> | 
|  | <span class="sd">        and a neutral "zeroValue" which may be added to the result an</span> | 
|  | <span class="sd">        arbitrary number of times, and must not change the result</span> | 
|  | <span class="sd">        (e.g., 0 for addition, or 1 for multiplication.).</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        zeroValue : V</span> | 
|  | <span class="sd">            the initial value for the accumulated result of each partition</span> | 
|  | <span class="sd">        func : function</span> | 
|  | <span class="sd">            a function to combine two V's into a single one</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            function to compute the partition index</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and the aggregated result for each key</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduceByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.combineByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregateByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.groupByKey`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</span> | 
|  | <span class="sd">        >>> from operator import add</span> | 
|  | <span class="sd">        >>> sorted(rdd.foldByKey(0, add).collect())</span> | 
|  | <span class="sd">        [('a', 2), ('b', 1)]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">createZero</span><span class="p">()</span> <span class="o">-></span> <span class="n">V</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">copy</span><span class="o">.</span><span class="n">deepcopy</span><span class="p">(</span><span class="n">zeroValue</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">combineByKey</span><span class="p">(</span> | 
|  | <span class="k">lambda</span> <span class="n">v</span><span class="p">:</span> <span class="n">func</span><span class="p">(</span><span class="n">createZero</span><span class="p">(),</span> <span class="n">v</span><span class="p">),</span> <span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
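|  | <span class="c1"># Memory cap (in MiB, from "spark.python.worker.memory") used to size the external mergers.</span> | 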
|  | <span class="k">def</span> <span class="nf">_memory_limit</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">_parse_memory</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.python.worker.memory"</span><span class="p">,</span> <span class="s2">"512m"</span><span class="p">))</span> | 
|  |  | 
|  | <span class="c1"># TODO: support variant with custom partitioner</span> | 
|  | <div class="viewcode-block" id="RDD.groupByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.groupByKey.html#pyspark.RDD.groupByKey">[docs]</a>    <span class="k">def</span> <span class="nf">groupByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">partitionFunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">K</span><span class="p">],</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">portable_hash</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Iterable[V]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Group the values for each key in the RDD into a single sequence.</span> | 
|  | <span class="sd">        Hash-partitions the resulting RDD with numPartitions partitions.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        partitionFunc : function, optional, default `portable_hash`</span> | 
|  | <span class="sd">            function to compute the partition index</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and the grouped result for each key</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.reduceByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.combineByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.aggregateByKey`</span> | 
|  | <span class="sd">        :meth:`RDD.foldByKey`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        If you are grouping in order to perform an aggregation (such as a</span> | 
|  | <span class="sd">        sum or average) over each key, using reduceByKey or aggregateByKey will</span> | 
|  | <span class="sd">        provide much better performance.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</span> | 
|  | <span class="sd">        >>> sorted(rdd.groupByKey().mapValues(len).collect())</span> | 
|  | <span class="sd">        [('a', 2), ('b', 1)]</span> | 
|  | <span class="sd">        >>> sorted(rdd.groupByKey().mapValues(list).collect())</span> | 
|  | <span class="sd">        [('a', [1, 1]), ('b', [1])]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">createCombiner</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">V</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="p">[</span><span class="n">x</span><span class="p">]</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">mergeValue</span><span class="p">(</span><span class="n">xs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">],</span> <span class="n">x</span><span class="p">:</span> <span class="n">V</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">]:</span> | 
|  | <span class="n">xs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">xs</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">mergeCombiners</span><span class="p">(</span><span class="n">a</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">],</span> <span class="n">b</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">]:</span> | 
|  | <span class="n">a</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">b</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">a</span> | 
|  |  | 
|  | <span class="n">memory</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_memory_limit</span><span class="p">()</span> | 
|  | <span class="n">serializer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> | 
|  | <span class="n">agg</span> <span class="o">=</span> <span class="n">Aggregator</span><span class="p">(</span><span class="n">createCombiner</span><span class="p">,</span> <span class="n">mergeValue</span><span class="p">,</span> <span class="n">mergeCombiners</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">combine</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">]]]:</span> | 
|  | <span class="n">merger</span> <span class="o">=</span> <span class="n">ExternalMerger</span><span class="p">(</span><span class="n">agg</span><span class="p">,</span> <span class="n">memory</span> <span class="o">*</span> <span class="mf">0.9</span><span class="p">,</span> <span class="n">serializer</span><span class="p">)</span> | 
|  | <span class="n">merger</span><span class="o">.</span><span class="n">mergeValues</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">merger</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> | 
|  |  | 
|  | <span class="n">locally_combined</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">combine</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | 
|  | <span class="n">shuffled</span> <span class="o">=</span> <span class="n">locally_combined</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">partitionFunc</span><span class="p">)</span> | 
|  |  | 
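|  | <span class="c1"># ExternalGroupBy merges the shuffled per-partition lists for each key, spilling to disk if they exceed the memory limit.</span> | 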
|  | <span class="k">def</span> <span class="nf">groupByKey</span><span class="p">(</span><span class="n">it</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">]]])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">]]]:</span> | 
|  | <span class="n">merger</span> <span class="o">=</span> <span class="n">ExternalGroupBy</span><span class="p">(</span><span class="n">agg</span><span class="p">,</span> <span class="n">memory</span><span class="p">,</span> <span class="n">serializer</span><span class="p">)</span> | 
|  | <span class="n">merger</span><span class="o">.</span><span class="n">mergeCombiners</span><span class="p">(</span><span class="n">it</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">merger</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">shuffled</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="n">groupByKey</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">mapValues</span><span class="p">(</span><span class="n">ResultIterable</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.flatMapValues"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.flatMapValues.html#pyspark.RDD.flatMapValues">[docs]</a>    <span class="k">def</span> <span class="nf">flatMapValues</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">V</span><span class="p">],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]]</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Pass each value in the key-value pair RDD through a flatMap function</span> | 
|  | <span class="sd">        without changing the keys; this also retains the original RDD's</span> | 
|  | <span class="sd">        partitioning.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">           a function to turn a V into a sequence of U</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and the flat-mapped value</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.flatMap`</span> | 
|  | <span class="sd">        :meth:`RDD.mapValues`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", ["x", "y", "z"]), ("b", ["p", "r"])])</span> | 
|  | <span class="sd">        >>> def f(x): return x</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> rdd.flatMapValues(f).collect()</span> | 
|  | <span class="sd">        [('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">flat_map_fn</span><span class="p">(</span><span class="n">kv</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">U</span><span class="p">]]:</span> | 
|  | <span class="k">return</span> <span class="p">((</span><span class="n">kv</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">f</span><span class="p">(</span><span class="n">kv</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="n">flat_map_fn</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.mapValues"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.mapValues.html#pyspark.RDD.mapValues">[docs]</a>    <span class="k">def</span> <span class="nf">mapValues</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">V</span><span class="p">],</span> <span class="n">U</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Pass each value in the key-value pair RDD through a map function</span> | 
|  | <span class="sd">        without changing the keys; this also retains the original RDD's</span> | 
|  | <span class="sd">        partitioning.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">           a function to turn a V into a U</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and the mapped value</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.map`</span> | 
|  | <span class="sd">        :meth:`RDD.flatMapValues`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])])</span> | 
|  | <span class="sd">        >>> def f(x): return len(x)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> rdd.mapValues(f).collect()</span> | 
|  | <span class="sd">        [('a', 3), ('b', 1)]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">map_values_fn</span><span class="p">(</span><span class="n">kv</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">])</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">U</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">kv</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">f</span><span class="p">(</span><span class="n">kv</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">map_values_fn</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">groupWith</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V1]]"</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1]]]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">groupWith</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V1]]"</span><span class="p">,</span> <span class="n">__o1</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V2]]"</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1], ResultIterable[V2]]]]"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">groupWith</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V1]]"</span><span class="p">,</span> | 
|  | <span class="n">_o1</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V2]]"</span><span class="p">,</span> | 
|  | <span class="n">_o2</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V3]]"</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"""RDD[</span> | 
|  | <span class="s2">        Tuple[</span> | 
|  | <span class="s2">            K,</span> | 
|  | <span class="s2">            Tuple[</span> | 
|  | <span class="s2">                ResultIterable[V],</span> | 
|  | <span class="s2">                ResultIterable[V1],</span> | 
|  | <span class="s2">                ResultIterable[V2],</span> | 
|  | <span class="s2">                ResultIterable[V3],</span> | 
|  | <span class="s2">            ],</span> | 
|  | <span class="s2">        ]</span> | 
|  | <span class="s2">    ]"""</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.groupWith"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.groupWith.html#pyspark.RDD.groupWith">[docs]</a>    <span class="k">def</span> <span class="nf">groupWith</span><span class="p">(</span>  <span class="c1"># type: ignore[misc]</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[Any, Any]]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[Any, Any]]"</span><span class="p">,</span> <span class="o">*</span><span class="n">others</span><span class="p">:</span> <span class="s2">"RDD[Tuple[Any, Any]]"</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[Any, Tuple[ResultIterable[Any], ...]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Alias for cogroup but with support for multiple RDDs.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  | <span class="sd">        others : :class:`RDD`</span> | 
|  | <span class="sd">            other :class:`RDD`\\s</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and cogrouped values</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.cogroup`</span> | 
|  | <span class="sd">        :meth:`RDD.join`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 5), ("b", 6)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 1), ("b", 4)])</span> | 
|  | <span class="sd">        >>> rdd3 = sc.parallelize([("a", 2)])</span> | 
|  | <span class="sd">        >>> rdd4 = sc.parallelize([("b", 42)])</span> | 
|  | <span class="sd">        >>> [(x, tuple(map(list, y))) for x, y in</span> | 
|  | <span class="sd">        ...     sorted(list(rdd1.groupWith(rdd2, rdd3, rdd4).collect()))]</span> | 
|  | <span class="sd">        [('a', ([5], [1], [2], [])), ('b', ([6], [4], [], [42]))]</span> | 
|  |  | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">python_cogroup</span><span class="p">((</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> <span class="o">+</span> <span class="n">others</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="c1"># TODO: add variant with custom partitioner</span> | 
|  | <div class="viewcode-block" id="RDD.cogroup"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.cogroup.html#pyspark.RDD.cogroup">[docs]</a>    <span class="k">def</span> <span class="nf">cogroup</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, U]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[U]]]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        For each key k in `self` or `other`, return a resulting RDD that</span> | 
|  | <span class="sd">        contains a tuple with the list of values for that key in `self` as</span> | 
|  | <span class="sd">        well as `other`.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the keys and cogrouped values</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.groupWith`</span> | 
|  | <span class="sd">        :meth:`RDD.join`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 2)])</span> | 
|  | <span class="sd">        >>> [(x, tuple(map(list, y))) for x, y in sorted(list(rdd1.cogroup(rdd2).collect()))]</span> | 
|  | <span class="sd">        [('a', ([1], [2])), ('b', ([4], []))]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">python_cogroup</span><span class="p">((</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">),</span> <span class="n">numPartitions</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sampleByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sampleByKey.html#pyspark.RDD.sampleByKey">[docs]</a>    <span class="k">def</span> <span class="nf">sampleByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">withReplacement</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> | 
|  | <span class="n">fractions</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> | 
|  | <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a subset of this RDD sampled by key (via stratified sampling).</span> | 
|  | <span class="sd">        Create a sample of this RDD using variable sampling rates for</span> | 
|  | <span class="sd">        different keys as specified by fractions, a key to sampling rate map.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.7.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        withReplacement : bool</span> | 
|  | <span class="sd">            whether to sample with or without replacement</span> | 
|  | <span class="sd">        fractions : dict</span> | 
|  | <span class="sd">            map of specific keys to sampling rates</span> | 
|  | <span class="sd">        seed : int, optional</span> | 
|  | <span class="sd">            seed for the random number generator</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the stratified sampling result</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.sample`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> fractions = {"a": 0.2, "b": 0.1}</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(fractions.keys()).cartesian(sc.parallelize(range(0, 1000)))</span> | 
|  | <span class="sd">        >>> sample = dict(rdd.sampleByKey(False, fractions, 2).groupByKey().collect())</span> | 
|  | <span class="sd">        >>> 100 < len(sample["a"]) < 300 and 50 < len(sample["b"]) < 150</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> max(sample["a"]) <= 999 and min(sample["a"]) >= 0</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> max(sample["b"]) <= 999 and min(sample["b"]) >= 0</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">for</span> <span class="n">fraction</span> <span class="ow">in</span> <span class="n">fractions</span><span class="o">.</span><span class="n">values</span><span class="p">():</span> | 
|  | <span class="k">assert</span> <span class="n">fraction</span> <span class="o">>=</span> <span class="mf">0.0</span><span class="p">,</span> <span class="s2">"Negative fraction value: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">fraction</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span> | 
|  | <span class="n">RDDStratifiedSampler</span><span class="p">(</span><span class="n">withReplacement</span><span class="p">,</span> <span class="n">fractions</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span><span class="o">.</span><span class="n">func</span><span class="p">,</span> <span class="kc">True</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.subtractByKey"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.subtractByKey.html#pyspark.RDD.subtractByKey">[docs]</a>    <span class="k">def</span> <span class="nf">subtractByKey</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> | 
|  | <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, Any]]"</span><span class="p">,</span> | 
|  | <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return each (key, value) pair in `self` that has no pair with matching</span> | 
|  | <span class="sd">        key in `other`.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` with the pairs from this whose keys are not in `other`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.subtract`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 2)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 3), ("c", None)])</span> | 
|  | <span class="sd">        >>> sorted(rdd1.subtractByKey(rdd2).collect())</span> | 
|  | <span class="sd">        [('b', 4), ('b', 5)]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">filter_func</span><span class="p">(</span><span class="n">pair</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">K</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">V</span><span class="p">,</span> <span class="n">Any</span><span class="p">]])</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="n">key</span><span class="p">,</span> <span class="p">(</span><span class="n">val1</span><span class="p">,</span> <span class="n">val2</span><span class="p">)</span> <span class="o">=</span> <span class="n">pair</span> | 
|  | <span class="k">return</span> <span class="n">val1</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">val2</span>  <span class="c1"># type: ignore[return-value]</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">cogroup</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span> | 
|  | <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">filter_func</span><span class="p">)</span>  <span class="c1"># type: ignore[arg-type]</span> | 
|  | <span class="o">.</span><span class="n">flatMapValues</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.subtract"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.subtract.html#pyspark.RDD.subtract">[docs]</a>    <span class="k">def</span> <span class="nf">subtract</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return each value in `self` that is not contained in `other`.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` with the elements from this that are not in `other`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.subtractByKey`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 3)])</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([("a", 3), ("c", None)])</span> | 
|  | <span class="sd">        >>> sorted(rdd1.subtract(rdd2).collect())</span> | 
|  | <span class="sd">        [('a', 1), ('b', 4), ('b', 5)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="c1"># note: here 'True' is just a placeholder</span> | 
|  | <span class="n">rdd</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="kc">True</span><span class="p">))</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="kc">True</span><span class="p">))</span><span class="o">.</span><span class="n">subtractByKey</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">)</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.keyBy"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.keyBy.html#pyspark.RDD.keyBy">[docs]</a>    <span class="k">def</span> <span class="nf">keyBy</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">T</span><span class="p">],</span> <span class="n">K</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[K, T]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Creates tuples of the elements in this RDD by applying `f`.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 0.9.1</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">            a function to compute the key</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` with the elements from this that are not in `other`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.map`</span> | 
|  | <span class="sd">        :meth:`RDD.keys`</span> | 
|  | <span class="sd">        :meth:`RDD.values`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize(range(0,3)).keyBy(lambda x: x*x)</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize(zip(range(0,5), range(0,5)))</span> | 
|  | <span class="sd">        >>> [(x, list(map(list, y))) for x, y in sorted(rdd1.cogroup(rdd2).collect())]</span> | 
|  | <span class="sd">        [(0, [[0], [0]]), (1, [[1], [1]]), (2, [[], [2]]), (3, [[], [3]]), (4, [[2], [4]])]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">f</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">x</span><span class="p">))</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.repartition"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.repartition.html#pyspark.RDD.repartition">[docs]</a>    <span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">         Return a new RDD that has exactly numPartitions partitions.</span> | 
|  |  | 
|  | <span class="sd">         Can increase or decrease the level of parallelism in this RDD.</span> | 
|  | <span class="sd">         Internally, this uses a shuffle to redistribute data.</span> | 
|  | <span class="sd">         If you are decreasing the number of partitions in this RDD, consider</span> | 
|  | <span class="sd">         using `coalesce`, which can avoid performing a shuffle.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` with exactly numPartitions partitions</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.coalesce`</span> | 
|  | <span class="sd">        :meth:`RDD.partitionBy`</span> | 
|  | <span class="sd">        :meth:`RDD.repartitionAndSortWithinPartitions`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">         >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)</span> | 
|  | <span class="sd">         >>> sorted(rdd.glom().collect())</span> | 
|  | <span class="sd">         [[1], [2, 3], [4, 5], [6, 7]]</span> | 
|  | <span class="sd">         >>> len(rdd.repartition(2).glom().collect())</span> | 
|  | <span class="sd">         2</span> | 
|  | <span class="sd">         >>> len(rdd.repartition(10).glom().collect())</span> | 
|  | <span class="sd">         10</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">shuffle</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.coalesce"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.coalesce.html#pyspark.RDD.coalesce">[docs]</a>    <span class="k">def</span> <span class="nf">coalesce</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">shuffle</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return a new RDD that is reduced into `numPartitions` partitions.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        numPartitions : int, optional</span> | 
|  | <span class="sd">            the number of partitions in new :class:`RDD`</span> | 
|  | <span class="sd">        shuffle : bool, optional, default False</span> | 
|  | <span class="sd">            whether to add a shuffle step</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` that is reduced into `numPartitions` partitions</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.repartition`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()</span> | 
|  | <span class="sd">        [[1], [2, 3], [4, 5]]</span> | 
|  | <span class="sd">        >>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()</span> | 
|  | <span class="sd">        [[1, 2, 3, 4, 5]]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="ow">not</span> <span class="n">numPartitions</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Number of partitions must be positive."</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="n">shuffle</span><span class="p">:</span> | 
|  | <span class="c1"># Decrease the batch size in order to distribute evenly the elements across output</span> | 
|  | <span class="c1"># partitions. Otherwise, repartition will possibly produce highly skewed partitions.</span> | 
|  | <span class="n">batchSize</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_batchSize</span> <span class="ow">or</span> <span class="mi">1024</span><span class="p">)</span> | 
|  | <span class="n">ser</span> <span class="o">=</span> <span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">CPickleSerializer</span><span class="p">(),</span> <span class="n">batchSize</span><span class="p">)</span> | 
|  | <span class="n">selfCopy</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reserialize</span><span class="p">(</span><span class="n">ser</span><span class="p">)</span> | 
|  | <span class="n">jrdd_deserializer</span> <span class="o">=</span> <span class="n">selfCopy</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> | 
|  | <span class="n">jrdd</span> <span class="o">=</span> <span class="n">selfCopy</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">shuffle</span><span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="n">jrdd_deserializer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> | 
|  | <span class="n">jrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">,</span> <span class="n">shuffle</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">RDD</span><span class="p">(</span><span class="n">jrdd</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">,</span> <span class="n">jrdd_deserializer</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.zip"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.zip.html#pyspark.RDD.zip">[docs]</a>    <span class="k">def</span> <span class="nf">zip</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"RDD[U]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[T, U]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Zips this RDD with another one, returning key-value pairs with the</span> | 
|  | <span class="sd">        first element in each RDD second element in each RDD, etc. Assumes</span> | 
|  | <span class="sd">        that the two RDDs have the same number of partitions and the same</span> | 
|  | <span class="sd">        number of elements in each partition (e.g. one was made through</span> | 
|  | <span class="sd">        a map on the other).</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        other : :class:`RDD`</span> | 
|  | <span class="sd">            another :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the zipped key-value pairs</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.zipWithIndex`</span> | 
|  | <span class="sd">        :meth:`RDD.zipWithUniqueId`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd1 = sc.parallelize(range(0,5))</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize(range(1000, 1005))</span> | 
|  | <span class="sd">        >>> rdd1.zip(rdd2).collect()</span> | 
|  | <span class="sd">        [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">get_batch_size</span><span class="p">(</span><span class="n">ser</span><span class="p">:</span> <span class="n">Serializer</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ser</span><span class="p">,</span> <span class="n">BatchedSerializer</span><span class="p">):</span> | 
|  | <span class="k">return</span> <span class="n">ser</span><span class="o">.</span><span class="n">batchSize</span> | 
|  | <span class="k">return</span> <span class="mi">1</span>  <span class="c1"># not batched</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">batch_as</span><span class="p">(</span><span class="n">rdd</span><span class="p">:</span> <span class="s2">"RDD[V]"</span><span class="p">,</span> <span class="n">batchSize</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[V]"</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">rdd</span><span class="o">.</span><span class="n">_reserialize</span><span class="p">(</span><span class="n">BatchedSerializer</span><span class="p">(</span><span class="n">CPickleSerializer</span><span class="p">(),</span> <span class="n">batchSize</span><span class="p">))</span> | 
|  |  | 
|  | <span class="n">my_batch</span> <span class="o">=</span> <span class="n">get_batch_size</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">)</span> | 
|  | <span class="n">other_batch</span> <span class="o">=</span> <span class="n">get_batch_size</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="n">my_batch</span> <span class="o">!=</span> <span class="n">other_batch</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">my_batch</span><span class="p">:</span> | 
|  | <span class="c1"># use the smallest batchSize for both of them</span> | 
|  | <span class="n">batchSize</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">my_batch</span><span class="p">,</span> <span class="n">other_batch</span><span class="p">)</span> | 
|  | <span class="k">if</span> <span class="n">batchSize</span> <span class="o"><=</span> <span class="mi">0</span><span class="p">:</span> | 
|  | <span class="c1"># auto batched or unlimited</span> | 
|  | <span class="n">batchSize</span> <span class="o">=</span> <span class="mi">100</span> | 
|  | <span class="n">other</span> <span class="o">=</span> <span class="n">batch_as</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">batchSize</span><span class="p">)</span> | 
|  | <span class="bp">self</span> <span class="o">=</span> <span class="n">batch_as</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">batchSize</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">!=</span> <span class="n">other</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">():</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Can only zip with RDD which has the same number of partitions"</span><span class="p">)</span> | 
|  |  | 
|  | <span class="c1"># There will be an Exception in JVM if there are different number</span> | 
|  | <span class="c1"># of items in each partitions.</span> | 
|  | <span class="n">pairRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">zip</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">)</span> | 
|  | <span class="n">deserializer</span> <span class="o">=</span> <span class="n">PairDeserializer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">,</span> <span class="n">other</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">RDD</span><span class="p">(</span><span class="n">pairRDD</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">,</span> <span class="n">deserializer</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.zipWithIndex"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.zipWithIndex.html#pyspark.RDD.zipWithIndex">[docs]</a>    <span class="k">def</span> <span class="nf">zipWithIndex</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[T, int]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Zips this RDD with its element indices.</span> | 
|  |  | 
|  | <span class="sd">        The ordering is first based on the partition index and then the</span> | 
|  | <span class="sd">        ordering of items within each partition. So the first item in</span> | 
|  | <span class="sd">        the first partition gets index 0, and the last item in the last</span> | 
|  | <span class="sd">        partition receives the largest index.</span> | 
|  |  | 
|  | <span class="sd">        This method needs to trigger a spark job when this RDD contains</span> | 
|  | <span class="sd">        more than one partitions.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the zipped key-index pairs</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.zip`</span> | 
|  | <span class="sd">        :meth:`RDD.zipWithUniqueId`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()</span> | 
|  | <span class="sd">        [('a', 0), ('b', 1), ('c', 2), ('d', 3)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">starts</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> | 
|  | <span class="n">nums</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="k">lambda</span> <span class="n">it</span><span class="p">:</span> <span class="p">[</span><span class="nb">sum</span><span class="p">(</span><span class="mi">1</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">it</span><span class="p">)])</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> | 
|  | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">nums</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span> | 
|  | <span class="n">starts</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">starts</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="n">nums</span><span class="p">[</span><span class="n">i</span><span class="p">])</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">k</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">it</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]:</span> | 
|  | <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">it</span><span class="p">,</span> <span class="n">starts</span><span class="p">[</span><span class="n">k</span><span class="p">]):</span> | 
|  | <span class="k">yield</span> <span class="n">v</span><span class="p">,</span> <span class="n">i</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">func</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.zipWithUniqueId"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.zipWithUniqueId.html#pyspark.RDD.zipWithUniqueId">[docs]</a>    <span class="k">def</span> <span class="nf">zipWithUniqueId</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Tuple[T, int]]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Zips this RDD with generated unique Long ids.</span> | 
|  |  | 
|  | <span class="sd">        Items in the kth partition will get ids k, n+k, 2*n+k, ..., where</span> | 
|  | <span class="sd">        n is the number of partitions. So there may exist gaps, but this</span> | 
|  | <span class="sd">        method won't trigger a spark job, which is different from</span> | 
|  | <span class="sd">        :meth:`zipWithIndex`.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a :class:`RDD` containing the zipped key-UniqueId pairs</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.zip`</span> | 
|  | <span class="sd">        :meth:`RDD.zipWithIndex`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()</span> | 
|  | <span class="sd">        [('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">n</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">k</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">it</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]:</span> | 
|  | <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">it</span><span class="p">):</span> | 
|  | <span class="k">yield</span> <span class="n">v</span><span class="p">,</span> <span class="n">i</span> <span class="o">*</span> <span class="n">n</span> <span class="o">+</span> <span class="n">k</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitionsWithIndex</span><span class="p">(</span><span class="n">func</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.name"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.name.html#pyspark.RDD.name">[docs]</a>    <span class="k">def</span> <span class="nf">name</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the name of this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        str</span> | 
|  | <span class="sd">            :class:`RDD` name</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.setName`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.range(5)</span> | 
|  | <span class="sd">        >>> rdd.name() == None</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">n</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">name</span><span class="p">()</span> | 
|  | <span class="k">return</span> <span class="n">n</span> <span class="k">if</span> <span class="n">n</span> <span class="k">else</span> <span class="kc">None</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.setName"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.setName.html#pyspark.RDD.setName">[docs]</a>    <span class="k">def</span> <span class="nf">setName</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Assign a name to this RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        name : str</span> | 
|  | <span class="sd">            new name</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            the same :class:`RDD` with name updated</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.name`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2])</span> | 
|  | <span class="sd">        >>> rdd.setName('I am an RDD').name()</span> | 
|  | <span class="sd">        'I am an RDD'</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">setName</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.toDebugString"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.toDebugString.html#pyspark.RDD.toDebugString">[docs]</a>    <span class="k">def</span> <span class="nf">toDebugString</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        A description of this RDD and its recursive dependencies for debugging.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        bytes</span> | 
|  | <span class="sd">            debugging information of this :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.range(5)</span> | 
|  | <span class="sd">        >>> rdd.toDebugString()</span> | 
|  | <span class="sd">        b'...PythonRDD...ParallelCollectionRDD...'</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">debug_string</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">toDebugString</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">debug_string</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">)</span> <span class="k">if</span> <span class="n">debug_string</span> <span class="k">else</span> <span class="kc">None</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.getStorageLevel"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.getStorageLevel.html#pyspark.RDD.getStorageLevel">[docs]</a>    <span class="k">def</span> <span class="nf">getStorageLevel</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">StorageLevel</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Get the RDD's current storage level.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.0.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`StorageLevel`</span> | 
|  | <span class="sd">            current :class:`StorageLevel`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.name`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1,2])</span> | 
|  | <span class="sd">        >>> rdd.getStorageLevel()</span> | 
|  | <span class="sd">        StorageLevel(False, False, False, False, 1)</span> | 
|  | <span class="sd">        >>> print(rdd.getStorageLevel())</span> | 
|  | <span class="sd">        Serialized 1x Replicated</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">java_storage_level</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">getStorageLevel</span><span class="p">()</span> | 
|  | <span class="n">storage_level</span> <span class="o">=</span> <span class="n">StorageLevel</span><span class="p">(</span> | 
|  | <span class="n">java_storage_level</span><span class="o">.</span><span class="n">useDisk</span><span class="p">(),</span> | 
|  | <span class="n">java_storage_level</span><span class="o">.</span><span class="n">useMemory</span><span class="p">(),</span> | 
|  | <span class="n">java_storage_level</span><span class="o">.</span><span class="n">useOffHeap</span><span class="p">(),</span> | 
|  | <span class="n">java_storage_level</span><span class="o">.</span><span class="n">deserialized</span><span class="p">(),</span> | 
|  | <span class="n">java_storage_level</span><span class="o">.</span><span class="n">replication</span><span class="p">(),</span> | 
|  | <span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">storage_level</span></div> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_defaultReducePartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Returns the default number of partitions to use during reduce tasks (e.g., groupBy).</span> | 
|  | <span class="sd">        If spark.default.parallelism is set, then we'll use the value from SparkContext</span> | 
|  | <span class="sd">        defaultParallelism, otherwise we'll use the number of partitions in this RDD.</span> | 
|  |  | 
|  | <span class="sd">        This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce</span> | 
|  | <span class="sd">        the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will</span> | 
|  | <span class="sd">        be inherent.</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="s2">"spark.default.parallelism"</span><span class="p">):</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">defaultParallelism</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getNumPartitions</span><span class="p">()</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.lookup"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.lookup.html#pyspark.RDD.lookup">[docs]</a>    <span class="k">def</span> <span class="nf">lookup</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Tuple[K, V]]"</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">K</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">V</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return the list of values in the RDD for key `key`. This operation</span> | 
|  | <span class="sd">        is done efficiently if the RDD has a known partitioner by only</span> | 
|  | <span class="sd">        searching the partition that the key maps to.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        key : K</span> | 
|  | <span class="sd">            the key to look up</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        list</span> | 
|  | <span class="sd">            the list of values in the :class:`RDD` for key `key`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> l = range(1000)</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(zip(l, l), 10)</span> | 
|  | <span class="sd">        >>> rdd.lookup(42)  # slow</span> | 
|  | <span class="sd">        [42]</span> | 
|  | <span class="sd">        >>> sorted = rdd.sortByKey()</span> | 
|  | <span class="sd">        >>> sorted.lookup(42)  # fast</span> | 
|  | <span class="sd">        [42]</span> | 
|  | <span class="sd">        >>> sorted.lookup(1024)</span> | 
|  | <span class="sd">        []</span> | 
|  | <span class="sd">        >>> rdd2 = sc.parallelize([(('a', 'b'), 'c')]).groupByKey()</span> | 
|  | <span class="sd">        >>> list(rdd2.lookup(('a', 'b'))[0])</span> | 
|  | <span class="sd">        ['c']</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">values</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">kv</span><span class="p">:</span> <span class="n">kv</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">key</span><span class="p">)</span><span class="o">.</span><span class="n">values</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">partitioner</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">runJob</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">,</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">partitioner</span><span class="p">(</span><span class="n">key</span><span class="p">)])</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">values</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span></div> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_to_java_object_rdd</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"JavaObject"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""Return a JavaRDD of Object by unpickling</span> | 
|  |  | 
|  | <span class="sd">        It will convert each Python object into Java object by Pickle, whenever the</span> | 
|  | <span class="sd">        RDD is serialized in batch or not.</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">rdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pickled</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SerDeUtil</span><span class="o">.</span><span class="n">pythonToJava</span><span class="p">(</span><span class="n">rdd</span><span class="o">.</span><span class="n">_jrdd</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.countApprox"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.countApprox.html#pyspark.RDD.countApprox">[docs]</a>    <span class="k">def</span> <span class="nf">countApprox</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">confidence</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.95</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Approximate version of count() that returns a potentially incomplete</span> | 
|  | <span class="sd">        result within a timeout, even if not all tasks have finished.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        timeout : int</span> | 
|  | <span class="sd">            maximum time to wait for the job, in milliseconds</span> | 
|  | <span class="sd">        confidence : float</span> | 
|  | <span class="sd">            the desired statistical confidence in the result</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        int</span> | 
|  | <span class="sd">            a potentially incomplete result, with error bounds</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.count`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(1000), 10)</span> | 
|  | <span class="sd">        >>> rdd.countApprox(1000, 1.0)</span> | 
|  | <span class="sd">        1000</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">drdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="k">lambda</span> <span class="n">it</span><span class="p">:</span> <span class="p">[</span><span class="nb">float</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="mi">1</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">it</span><span class="p">))])</span> | 
|  | <span class="k">return</span> <span class="nb">int</span><span class="p">(</span><span class="n">drdd</span><span class="o">.</span><span class="n">sumApprox</span><span class="p">(</span><span class="n">timeout</span><span class="p">,</span> <span class="n">confidence</span><span class="p">))</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.sumApprox"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.sumApprox.html#pyspark.RDD.sumApprox">[docs]</a>    <span class="k">def</span> <span class="nf">sumApprox</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Union[float, int]]"</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">confidence</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.95</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">BoundedFloat</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Approximate operation to return the sum within a timeout</span> | 
|  | <span class="sd">        or meet the confidence.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        timeout : int</span> | 
|  | <span class="sd">            maximum time to wait for the job, in milliseconds</span> | 
|  | <span class="sd">        confidence : float</span> | 
|  | <span class="sd">            the desired statistical confidence in the result</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`BoundedFloat`</span> | 
|  | <span class="sd">            a potentially incomplete result, with error bounds</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.sum`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(1000), 10)</span> | 
|  | <span class="sd">        >>> r = sum(range(1000))</span> | 
|  | <span class="sd">        >>> abs(rdd.sumApprox(1000) - r) / r < 0.05</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">jrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapPartitions</span><span class="p">(</span><span class="k">lambda</span> <span class="n">it</span><span class="p">:</span> <span class="p">[</span><span class="nb">float</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="n">it</span><span class="p">))])</span><span class="o">.</span><span class="n">_to_java_object_rdd</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="n">jdrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">JavaDoubleRDD</span><span class="o">.</span><span class="n">fromRDD</span><span class="p">(</span><span class="n">jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">())</span> | 
|  | <span class="n">r</span> <span class="o">=</span> <span class="n">jdrdd</span><span class="o">.</span><span class="n">sumApprox</span><span class="p">(</span><span class="n">timeout</span><span class="p">,</span> <span class="n">confidence</span><span class="p">)</span><span class="o">.</span><span class="n">getFinalValue</span><span class="p">()</span> | 
|  | <span class="k">return</span> <span class="n">BoundedFloat</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">mean</span><span class="p">(),</span> <span class="n">r</span><span class="o">.</span><span class="n">confidence</span><span class="p">(),</span> <span class="n">r</span><span class="o">.</span><span class="n">low</span><span class="p">(),</span> <span class="n">r</span><span class="o">.</span><span class="n">high</span><span class="p">())</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.meanApprox"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.meanApprox.html#pyspark.RDD.meanApprox">[docs]</a>    <span class="k">def</span> <span class="nf">meanApprox</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Union[float, int]]"</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">confidence</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.95</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">BoundedFloat</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Approximate operation to return the mean within a timeout</span> | 
|  | <span class="sd">        or meet the confidence.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        timeout : int</span> | 
|  | <span class="sd">            maximum time to wait for the job, in milliseconds</span> | 
|  | <span class="sd">        confidence : float</span> | 
|  | <span class="sd">            the desired statistical confidence in the result</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`BoundedFloat`</span> | 
|  | <span class="sd">            a potentially incomplete result, with error bounds</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.mean`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(1000), 10)</span> | 
|  | <span class="sd">        >>> r = sum(range(1000)) / 1000.0</span> | 
|  | <span class="sd">        >>> abs(rdd.meanApprox(1000) - r) / r < 0.05</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">jrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">_to_java_object_rdd</span><span class="p">()</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="n">jdrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">JavaDoubleRDD</span><span class="o">.</span><span class="n">fromRDD</span><span class="p">(</span><span class="n">jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">())</span> | 
|  | <span class="n">r</span> <span class="o">=</span> <span class="n">jdrdd</span><span class="o">.</span><span class="n">meanApprox</span><span class="p">(</span><span class="n">timeout</span><span class="p">,</span> <span class="n">confidence</span><span class="p">)</span><span class="o">.</span><span class="n">getFinalValue</span><span class="p">()</span> | 
|  | <span class="k">return</span> <span class="n">BoundedFloat</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">mean</span><span class="p">(),</span> <span class="n">r</span><span class="o">.</span><span class="n">confidence</span><span class="p">(),</span> <span class="n">r</span><span class="o">.</span><span class="n">low</span><span class="p">(),</span> <span class="n">r</span><span class="o">.</span><span class="n">high</span><span class="p">())</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.countApproxDistinct"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.countApproxDistinct.html#pyspark.RDD.countApproxDistinct">[docs]</a>    <span class="k">def</span> <span class="nf">countApproxDistinct</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">relativeSD</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return approximate number of distinct elements in the RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.2.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        relativeSD : float, optional</span> | 
|  | <span class="sd">            Relative accuracy. Smaller values create</span> | 
|  | <span class="sd">            counters that require more space.</span> | 
|  | <span class="sd">            It must be greater than 0.000017.</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        int</span> | 
|  | <span class="sd">            approximate number of distinct elements</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.distinct`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        The algorithm used is based on streamlib's implementation of</span> | 
|  | <span class="sd">        `"HyperLogLog in Practice: Algorithmic Engineering of a State</span> | 
|  | <span class="sd">        of The Art Cardinality Estimation Algorithm", available here</span> | 
|  | <span class="sd">        <https://doi.org/10.1145/2452376.2452456>`_.</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()</span> | 
|  | <span class="sd">        >>> 900 < n < 1100</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        >>> n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()</span> | 
|  | <span class="sd">        >>> 16 < n < 24</span> | 
|  | <span class="sd">        True</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">if</span> <span class="n">relativeSD</span> <span class="o"><</span> <span class="mf">0.000017</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"relativeSD should be greater than 0.000017"</span><span class="p">)</span> | 
|  | <span class="c1"># the hash space in Java is 2^32</span> | 
|  | <span class="n">hashRDD</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">portable_hash</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">&</span> <span class="mh">0xFFFFFFFF</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">hashRDD</span><span class="o">.</span><span class="n">_to_java_object_rdd</span><span class="p">()</span><span class="o">.</span><span class="n">countApproxDistinct</span><span class="p">(</span><span class="n">relativeSD</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.toLocalIterator"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.toLocalIterator.html#pyspark.RDD.toLocalIterator">[docs]</a>    <span class="k">def</span> <span class="nf">toLocalIterator</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">prefetchPartitions</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Return an iterator that contains all of the elements in this RDD.</span> | 
|  | <span class="sd">        The iterator will consume as much memory as the largest partition in this RDD.</span> | 
|  | <span class="sd">        With prefetch it may consume up to the memory of the 2 largest partitions.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 1.3.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        prefetchPartitions : bool, optional</span> | 
|  | <span class="sd">            If Spark should pre-fetch the next partition</span> | 
|  | <span class="sd">            before it is needed.</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`collections.abc.Iterator`</span> | 
|  | <span class="sd">            an iterator that contains all of the elements in this :class:`RDD`</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.collect`</span> | 
|  | <span class="sd">        :meth:`pyspark.sql.DataFrame.toLocalIterator`</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize(range(10))</span> | 
|  | <span class="sd">        >>> [x for x in rdd.toLocalIterator()]</span> | 
|  | <span class="sd">        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="k">with</span> <span class="n">SCCallSiteSync</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">context</span><span class="p">):</span> | 
|  | <span class="n">sock_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="o">.</span><span class="n">toLocalIteratorAndServe</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">(),</span> <span class="n">prefetchPartitions</span> | 
|  | <span class="p">)</span> | 
|  | <span class="k">return</span> <span class="n">_local_iterator_from_socket</span><span class="p">(</span><span class="n">sock_info</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.barrier"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.barrier.html#pyspark.RDD.barrier">[docs]</a>    <span class="k">def</span> <span class="nf">barrier</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDDBarrier[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Marks the current stage as a barrier stage, where Spark must launch all tasks together.</span> | 
|  | <span class="sd">        In case of a task failure, instead of only restarting the failed task, Spark will abort the</span> | 
|  | <span class="sd">        entire stage and relaunch all tasks for this stage.</span> | 
|  | <span class="sd">        The barrier execution mode feature is experimental and it only handles limited scenarios.</span> | 
|  | <span class="sd">        Please read the linked SPIP and design docs to understand the limitations and future plans.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 2.4.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDDBarrier`</span> | 
|  | <span class="sd">            instance that provides actions within a barrier stage.</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :class:`pyspark.BarrierTaskContext`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        For additional information see</span> | 
|  |  | 
|  | <span class="sd">        - `SPIP: Barrier Execution Mode <https://issues.apache.org/jira/browse/SPARK-24374>`_</span> | 
|  | <span class="sd">        - `Design Doc <https://issues.apache.org/jira/browse/SPARK-24582>`_</span> | 
|  |  | 
|  | <span class="sd">        This API is experimental</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">RDDBarrier</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span></div> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_is_barrier</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Whether this RDD is in a barrier stage.</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">()</span><span class="o">.</span><span class="n">isBarrier</span><span class="p">()</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.withResources"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.withResources.html#pyspark.RDD.withResources">[docs]</a>    <span class="k">def</span> <span class="nf">withResources</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[T]"</span><span class="p">,</span> <span class="n">profile</span><span class="p">:</span> <span class="n">ResourceProfile</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[T]"</span><span class="p">:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Specify a :class:`pyspark.resource.ResourceProfile` to use when calculating this RDD.</span> | 
|  | <span class="sd">        This is only supported on certain cluster managers and currently requires dynamic</span> | 
|  | <span class="sd">        allocation to be enabled. It will result in new executors with the resources specified</span> | 
|  | <span class="sd">        being acquired to calculate the RDD.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 3.1.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        profile : :class:`pyspark.resource.ResourceProfile`</span> | 
|  | <span class="sd">            a resource profile</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            the same :class:`RDD` with user specified profile</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.getResourceProfile`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This API is experimental</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">has_resource_profile</span> <span class="o">=</span> <span class="kc">True</span> | 
|  | <span class="k">if</span> <span class="n">profile</span><span class="o">.</span><span class="n">_java_resource_profile</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="n">jrp</span> <span class="o">=</span> <span class="n">profile</span><span class="o">.</span><span class="n">_java_resource_profile</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="n">builder</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">resource</span><span class="o">.</span><span class="n">ResourceProfileBuilder</span><span class="p">()</span> | 
|  | <span class="n">ereqs</span> <span class="o">=</span> <span class="n">ExecutorResourceRequests</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="n">profile</span><span class="o">.</span><span class="n">_executor_resource_requests</span><span class="p">)</span> | 
|  | <span class="n">treqs</span> <span class="o">=</span> <span class="n">TaskResourceRequests</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="n">profile</span><span class="o">.</span><span class="n">_task_resource_requests</span><span class="p">)</span> | 
|  | <span class="n">builder</span><span class="o">.</span><span class="n">require</span><span class="p">(</span><span class="n">ereqs</span><span class="o">.</span><span class="n">_java_executor_resource_requests</span><span class="p">)</span> | 
|  | <span class="n">builder</span><span class="o">.</span><span class="n">require</span><span class="p">(</span><span class="n">treqs</span><span class="o">.</span><span class="n">_java_task_resource_requests</span><span class="p">)</span> | 
|  | <span class="n">jrp</span> <span class="o">=</span> <span class="n">builder</span><span class="o">.</span><span class="n">build</span><span class="p">()</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">withResources</span><span class="p">(</span><span class="n">jrp</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDD.getResourceProfile"><a class="viewcode-back" href="../../reference/api/pyspark.RDD.getResourceProfile.html#pyspark.RDD.getResourceProfile">[docs]</a>    <span class="k">def</span> <span class="nf">getResourceProfile</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">ResourceProfile</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Get the :class:`pyspark.resource.ResourceProfile` specified with this RDD or None</span> | 
|  | <span class="sd">        if it wasn't specified.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 3.1.0</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        class:`pyspark.resource.ResourceProfile`</span> | 
|  | <span class="sd">            The user specified profile or None if none were specified</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.withResources`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This API is experimental</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="n">rp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">getResourceProfile</span><span class="p">()</span> | 
|  | <span class="k">if</span> <span class="n">rp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="n">ResourceProfile</span><span class="p">(</span><span class="n">_java_resource_profile</span><span class="o">=</span><span class="n">rp</span><span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="kc">None</span></div> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[RowLike]"</span><span class="p">,</span> | 
|  | <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="n">sampleRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[RowLike]"</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"StructType"</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="nd">@overload</span> | 
|  | <span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[AtomicValue]"</span><span class="p">,</span> | 
|  | <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"AtomicType"</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> | 
|  | <span class="o">...</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">:</span> <span class="s2">"RDD[Any]"</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">sampleRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> | 
|  | <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> | 
|  | <span class="n">error_class</span><span class="o">=</span><span class="s2">"CALL_BEFORE_INITIALIZE"</span><span class="p">,</span> | 
|  | <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span> | 
|  | <span class="s2">"func_name"</span><span class="p">:</span> <span class="s2">"RDD.toDF"</span><span class="p">,</span> | 
|  | <span class="s2">"object"</span><span class="p">:</span> <span class="s2">"SparkSession"</span><span class="p">,</span> | 
|  | <span class="p">},</span> | 
|  | <span class="p">)</span></div> | 
|  |  | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_prepare_for_python_RDD</span><span class="p">(</span><span class="n">sc</span><span class="p">:</span> <span class="s2">"SparkContext"</span><span class="p">,</span> <span class="n">command</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">bytes</span><span class="p">,</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Any</span><span class="p">]:</span> | 
|  | <span class="c1"># the serialized command will be compressed by broadcast</span> | 
|  | <span class="n">ser</span> <span class="o">=</span> <span class="n">CloudPickleSerializer</span><span class="p">()</span> | 
|  | <span class="n">pickled_command</span> <span class="o">=</span> <span class="n">ser</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">command</span><span class="p">)</span> | 
|  | <span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pickled_command</span><span class="p">)</span> <span class="o">></span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonUtils</span><span class="o">.</span><span class="n">getBroadcastThreshold</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jsc</span><span class="p">):</span>  <span class="c1"># Default 1M</span> | 
|  | <span class="c1"># The broadcast will have same life cycle as created PythonRDD</span> | 
|  | <span class="n">broadcast</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">broadcast</span><span class="p">(</span><span class="n">pickled_command</span><span class="p">)</span> | 
|  | <span class="n">pickled_command</span> <span class="o">=</span> <span class="n">ser</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">broadcast</span><span class="p">)</span> | 
|  | <span class="n">broadcast_vars</span> <span class="o">=</span> <span class="p">[</span><span class="n">x</span><span class="o">.</span><span class="n">_jbroadcast</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">sc</span><span class="o">.</span><span class="n">_pickled_broadcast_vars</span><span class="p">]</span> | 
|  | <span class="n">sc</span><span class="o">.</span><span class="n">_pickled_broadcast_vars</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span> | 
|  | <span class="k">return</span> <span class="n">pickled_command</span><span class="p">,</span> <span class="n">broadcast_vars</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">environment</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_python_includes</span> | 
|  |  | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_wrap_function</span><span class="p">(</span> | 
|  | <span class="n">sc</span><span class="p">:</span> <span class="s2">"SparkContext"</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">deserializer</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">serializer</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">profiler</span><span class="p">:</span> <span class="n">Any</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="s2">"JavaObject"</span><span class="p">:</span> | 
|  | <span class="k">assert</span> <span class="n">deserializer</span><span class="p">,</span> <span class="s2">"deserializer should not be empty"</span> | 
|  | <span class="k">assert</span> <span class="n">serializer</span><span class="p">,</span> <span class="s2">"serializer should not be empty"</span> | 
|  | <span class="n">command</span> <span class="o">=</span> <span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">profiler</span><span class="p">,</span> <span class="n">deserializer</span><span class="p">,</span> <span class="n">serializer</span><span class="p">)</span> | 
|  | <span class="n">pickled_command</span><span class="p">,</span> <span class="n">broadcast_vars</span><span class="p">,</span> <span class="n">env</span><span class="p">,</span> <span class="n">includes</span> <span class="o">=</span> <span class="n">_prepare_for_python_RDD</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">command</span><span class="p">)</span> | 
|  | <span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="k">return</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SimplePythonFunction</span><span class="p">(</span> | 
|  | <span class="nb">bytearray</span><span class="p">(</span><span class="n">pickled_command</span><span class="p">),</span> | 
|  | <span class="n">env</span><span class="p">,</span> | 
|  | <span class="n">includes</span><span class="p">,</span> | 
|  | <span class="n">sc</span><span class="o">.</span><span class="n">pythonExec</span><span class="p">,</span> | 
|  | <span class="n">sc</span><span class="o">.</span><span class="n">pythonVer</span><span class="p">,</span> | 
|  | <span class="n">broadcast_vars</span><span class="p">,</span> | 
|  | <span class="n">sc</span><span class="o">.</span><span class="n">_javaAccumulator</span><span class="p">,</span> | 
|  | <span class="p">)</span> | 
|  |  | 
|  |  | 
|  | <div class="viewcode-block" id="RDDBarrier"><a class="viewcode-back" href="../../reference/api/pyspark.RDDBarrier.html#pyspark.RDDBarrier">[docs]</a><span class="k">class</span> <span class="nc">RDDBarrier</span><span class="p">(</span><span class="n">Generic</span><span class="p">[</span><span class="n">T</span><span class="p">]):</span> | 
|  |  | 
|  | <span class="w">    </span><span class="sd">"""</span> | 
|  | <span class="sd">    Wraps an RDD in a barrier stage, which forces Spark to launch tasks of this stage together.</span> | 
|  | <span class="sd">    :class:`RDDBarrier` instances are created by :meth:`RDD.barrier`.</span> | 
|  |  | 
|  | <span class="sd">    .. versionadded:: 2.4.0</span> | 
|  |  | 
|  | <span class="sd">    Notes</span> | 
|  | <span class="sd">    -----</span> | 
|  | <span class="sd">    This API is experimental</span> | 
|  | <span class="sd">    """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">rdd</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="n">T</span><span class="p">]):</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">rdd</span> <span class="o">=</span> <span class="n">rdd</span> | 
|  |  | 
|  | <div class="viewcode-block" id="RDDBarrier.mapPartitions"><a class="viewcode-back" href="../../reference/api/pyspark.RDDBarrier.mapPartitions.html#pyspark.RDDBarrier.mapPartitions">[docs]</a>    <span class="k">def</span> <span class="nf">mapPartitions</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]],</span> <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">RDD</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Returns a new RDD by applying a function to each partition of the wrapped RDD,</span> | 
|  | <span class="sd">        where tasks are launched together in a barrier stage.</span> | 
|  | <span class="sd">        The interface is the same as :meth:`RDD.mapPartitions`.</span> | 
|  | <span class="sd">        Please see the API doc there.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 2.4.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">           a function to run on each partition of the RDD</span> | 
|  | <span class="sd">        preservesPartitioning : bool, optional, default False</span> | 
|  | <span class="sd">            indicates whether the input function preserves the partitioner,</span> | 
|  | <span class="sd">            which should be False unless this is a pair RDD and the input</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to each partition</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitions`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This API is experimental</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4], 2)</span> | 
|  | <span class="sd">        >>> def f(iterator): yield sum(iterator)</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> barrier = rdd.barrier()</span> | 
|  | <span class="sd">        >>> barrier</span> | 
|  | <span class="sd">        <pyspark.rdd.RDDBarrier ...></span> | 
|  | <span class="sd">        >>> barrier.mapPartitions(f).collect()</span> | 
|  | <span class="sd">        [3, 7]</span> | 
|  | <span class="sd">        """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">iterator</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">return</span> <span class="n">PipelinedRDD</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">rdd</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="p">,</span> <span class="n">isFromBarrier</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> | 
|  |  | 
|  | <div class="viewcode-block" id="RDDBarrier.mapPartitionsWithIndex"><a class="viewcode-back" href="../../reference/api/pyspark.RDDBarrier.mapPartitionsWithIndex.html#pyspark.RDDBarrier.mapPartitionsWithIndex">[docs]</a>    <span class="k">def</span> <span class="nf">mapPartitionsWithIndex</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">,</span> | 
|  | <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]],</span> | 
|  | <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> | 
|  | <span class="p">)</span> <span class="o">-></span> <span class="n">RDD</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="w">        </span><span class="sd">"""</span> | 
|  | <span class="sd">        Returns a new RDD by applying a function to each partition of the wrapped RDD, while</span> | 
|  | <span class="sd">        tracking the index of the original partition. And all tasks are launched together</span> | 
|  | <span class="sd">        in a barrier stage.</span> | 
|  | <span class="sd">        The interface is the same as :meth:`RDD.mapPartitionsWithIndex`.</span> | 
|  | <span class="sd">        Please see the API doc there.</span> | 
|  |  | 
|  | <span class="sd">        .. versionadded:: 3.0.0</span> | 
|  |  | 
|  | <span class="sd">        Parameters</span> | 
|  | <span class="sd">        ----------</span> | 
|  | <span class="sd">        f : function</span> | 
|  | <span class="sd">           a function to run on each partition of the RDD</span> | 
|  | <span class="sd">        preservesPartitioning : bool, optional, default False</span> | 
|  | <span class="sd">            indicates whether the input function preserves the partitioner,</span> | 
|  | <span class="sd">            which should be False unless this is a pair RDD and the input</span> | 
|  |  | 
|  | <span class="sd">        Returns</span> | 
|  | <span class="sd">        -------</span> | 
|  | <span class="sd">        :class:`RDD`</span> | 
|  | <span class="sd">            a new :class:`RDD` by applying a function to each partition</span> | 
|  |  | 
|  | <span class="sd">        See Also</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        :meth:`RDD.mapPartitionsWithIndex`</span> | 
|  |  | 
|  | <span class="sd">        Notes</span> | 
|  | <span class="sd">        -----</span> | 
|  | <span class="sd">        This API is experimental</span> | 
|  |  | 
|  | <span class="sd">        Examples</span> | 
|  | <span class="sd">        --------</span> | 
|  | <span class="sd">        >>> rdd = sc.parallelize([1, 2, 3, 4], 4)</span> | 
|  | <span class="sd">        >>> def f(splitIndex, iterator): yield splitIndex</span> | 
|  | <span class="sd">        ...</span> | 
|  | <span class="sd">        >>> barrier = rdd.barrier()</span> | 
|  | <span class="sd">        >>> barrier</span> | 
|  | <span class="sd">        <pyspark.rdd.RDDBarrier ...></span> | 
|  | <span class="sd">        >>> barrier.mapPartitionsWithIndex(f).sum()</span> | 
|  | <span class="sd">        6</span> | 
|  | <span class="sd">        """</span> | 
|  | <span class="k">return</span> <span class="n">PipelinedRDD</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">rdd</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">preservesPartitioning</span><span class="p">,</span> <span class="n">isFromBarrier</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div></div> | 
|  |  | 
|  |  | 
|  | <span class="k">class</span> <span class="nc">PipelinedRDD</span><span class="p">(</span><span class="n">RDD</span><span class="p">[</span><span class="n">U</span><span class="p">],</span> <span class="n">Generic</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="n">U</span><span class="p">]):</span> | 
|  |  | 
|  | <span class="w">    </span><span class="sd">"""</span> | 
|  | <span class="sd">    Examples</span> | 
|  | <span class="sd">    --------</span> | 
|  | <span class="sd">    Pipelined maps:</span> | 
|  |  | 
|  | <span class="sd">    >>> rdd = sc.parallelize([1, 2, 3, 4])</span> | 
|  | <span class="sd">    >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()</span> | 
|  | <span class="sd">    [4, 8, 12, 16]</span> | 
|  | <span class="sd">    >>> rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect()</span> | 
|  | <span class="sd">    [4, 8, 12, 16]</span> | 
|  |  | 
|  | <span class="sd">    Pipelined reduces:</span> | 
|  |  | 
|  | <span class="sd">    >>> from operator import add</span> | 
|  | <span class="sd">    >>> rdd.map(lambda x: 2 * x).reduce(add)</span> | 
|  | <span class="sd">    20</span> | 
|  | <span class="sd">    >>> rdd.flatMap(lambda x: [x, x]).reduce(add)</span> | 
|  | <span class="sd">    20</span> | 
|  | <span class="sd">    """</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="p">,</span> | 
|  | <span class="n">prev</span><span class="p">:</span> <span class="n">RDD</span><span class="p">[</span><span class="n">T</span><span class="p">],</span> | 
|  | <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]],</span> | 
|  | <span class="n">preservesPartitioning</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> | 
|  | <span class="n">isFromBarrier</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> | 
|  | <span class="p">):</span> | 
|  | <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prev</span><span class="p">,</span> <span class="n">PipelinedRDD</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">prev</span><span class="o">.</span><span class="n">_is_pipelinable</span><span class="p">():</span> | 
|  | <span class="c1"># This transformation is the first in its stage:</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">func</span> <span class="o">=</span> <span class="n">func</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">preservesPartitioning</span> <span class="o">=</span> <span class="n">preservesPartitioning</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_prev_jrdd</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">_jrdd</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_prev_jrdd_deserializer</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="n">prev_func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">V</span><span class="p">]],</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">T</span><span class="p">]]</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">func</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">pipeline_func</span><span class="p">(</span><span class="n">split</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">iterator</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">V</span><span class="p">])</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">U</span><span class="p">]:</span> | 
|  | <span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="n">split</span><span class="p">,</span> <span class="n">prev_func</span><span class="p">(</span><span class="n">split</span><span class="p">,</span> <span class="n">iterator</span><span class="p">))</span> | 
|  |  | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">func</span> <span class="o">=</span> <span class="n">pipeline_func</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">preservesPartitioning</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">preservesPartitioning</span> <span class="ow">and</span> <span class="n">preservesPartitioning</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_prev_jrdd</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">_prev_jrdd</span>  <span class="c1"># maintain the pipeline</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_prev_jrdd_deserializer</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">_prev_jrdd_deserializer</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">has_resource_profile</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">is_checkpointed</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">ctx</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">prev</span> <span class="o">=</span> <span class="n">prev</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_val</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"JavaObject"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_id</span> <span class="o">=</span> <span class="kc">None</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">serializer</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_bypass_serializer</span> <span class="o">=</span> <span class="kc">False</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">partitioner</span> <span class="o">=</span> <span class="n">prev</span><span class="o">.</span><span class="n">partitioner</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">preservesPartitioning</span> <span class="k">else</span> <span class="kc">None</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">is_barrier</span> <span class="o">=</span> <span class="n">isFromBarrier</span> <span class="ow">or</span> <span class="n">prev</span><span class="o">.</span><span class="n">_is_barrier</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">getNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prev_jrdd</span><span class="o">.</span><span class="n">partitions</span><span class="p">()</span><span class="o">.</span><span class="n">size</span><span class="p">()</span> | 
|  |  | 
|  | <span class="nd">@property</span> | 
|  | <span class="k">def</span> <span class="nf">_jrdd</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"JavaObject"</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_val</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_val</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_bypass_serializer</span><span class="p">:</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span> <span class="o">=</span> <span class="n">NoOpSerializer</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">profiler_collector</span> | 
|  | <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.python.profile"</span><span class="p">,</span> <span class="s2">"false"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"true"</span> | 
|  | <span class="p">):</span> | 
|  | <span class="n">profiler</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">profiler_collector</span><span class="o">.</span><span class="n">new_profiler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">)</span> | 
|  | <span class="k">else</span><span class="p">:</span> | 
|  | <span class="n">profiler</span> <span class="o">=</span> <span class="kc">None</span> | 
|  |  | 
|  | <span class="n">wrapped_func</span> <span class="o">=</span> <span class="n">_wrap_function</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">func</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prev_jrdd_deserializer</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_deserializer</span><span class="p">,</span> <span class="n">profiler</span> | 
|  | <span class="p">)</span> | 
|  |  | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="n">python_rdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonRDD</span><span class="p">(</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_prev_jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">(),</span> <span class="n">wrapped_func</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">preservesPartitioning</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_barrier</span> | 
|  | <span class="p">)</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_val</span> <span class="o">=</span> <span class="n">python_rdd</span><span class="o">.</span><span class="n">asJavaRDD</span><span class="p">()</span> | 
|  |  | 
|  | <span class="k">if</span> <span class="n">profiler</span><span class="p">:</span> | 
|  | <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_val</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_id</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_val</span><span class="o">.</span><span class="n">id</span><span class="p">()</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">ctx</span><span class="o">.</span><span class="n">profiler_collector</span><span class="o">.</span><span class="n">add_profiler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_id</span><span class="p">,</span> <span class="n">profiler</span><span class="p">)</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd_val</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">id</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> | 
|  | <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="bp">self</span><span class="o">.</span><span class="n">_id</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jrdd</span><span class="o">.</span><span class="n">id</span><span class="p">()</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_id</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_is_pipelinable</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="ow">not</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_checkpointed</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_resource_profile</span><span class="p">)</span> | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_is_barrier</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> | 
|  | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_barrier</span> | 
|  |  | 
|  |  | 
|  | <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> | 
|  | <span class="kn">import</span> <span class="nn">doctest</span> | 
|  | <span class="kn">import</span> <span class="nn">tempfile</span> | 
|  | <span class="kn">from</span> <span class="nn">pyspark.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> | 
|  |  | 
|  | <span class="n">tmp_dir</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">TemporaryDirectory</span><span class="p">()</span> | 
|  | <span class="n">globs</span> <span class="o">=</span> <span class="nb">globals</span><span class="p">()</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> | 
|  | <span class="c1"># The small batch size here ensures that we see multiple batches,</span> | 
|  | <span class="c1"># even in these small test examples:</span> | 
|  | <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">,</span> <span class="s2">"PythonTest"</span><span class="p">)</span> | 
|  | <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span><span class="o">.</span><span class="n">setCheckpointDir</span><span class="p">(</span><span class="n">tmp_dir</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> | 
|  | <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span> | 
|  | <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> | 
|  | <span class="n">tmp_dir</span><span class="o">.</span><span class="n">cleanup</span><span class="p">()</span> | 
|  | <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> | 
|  | <span class="n">tmp_dir</span><span class="o">.</span><span class="n">cleanup</span><span class="p">()</span> | 
|  | <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> | 
|  |  | 
|  |  | 
|  | <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> | 
|  | <span class="n">_test</span><span class="p">()</span> | 
|  | </pre></div> | 
|  |  | 
|  | </div> | 
|  |  | 
|  |  | 
|  |  | 
|  | </main> | 
|  |  | 
|  |  | 
|  | </div> | 
|  | </div> | 
|  |  | 
|  | <script src="../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script> | 
|  | <footer class="footer mt-5 mt-md-0"> | 
|  | <div class="container"> | 
|  |  | 
|  | <div class="footer-item"> | 
|  | <p class="copyright"> | 
|  | © Copyright .<br> | 
|  | </p> | 
|  | </div> | 
|  |  | 
|  | <div class="footer-item"> | 
|  | <p class="sphinx-version"> | 
|  | Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br> | 
|  | </p> | 
|  | </div> | 
|  |  | 
|  | </div> | 
|  | </footer> | 
|  | </body> | 
|  | </html> |