| |
| <!DOCTYPE html> |
| |
| <html> |
| <head> |
| <meta charset="utf-8" /> |
| <title>Quickstart: DataFrame — PySpark 3.5.5 documentation</title> |
| |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Quickstart: Spark Connect" href="quickstart_connect.html" /> |
| <link rel="prev" title="Installation" href="install.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="None"> |
| |
| |
| <!-- Google Analytics --> |
| |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <div class="container-fluid" id="banner"></div> |
| |
| |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"><div class="container-xl"> |
| |
| <div id="navbar-start"> |
| |
| |
| |
| <a class="navbar-brand" href="../index.html"> |
| <img src="../_static/spark-logo-reverse.png" class="logo" alt="logo"> |
| </a> |
| |
| |
| |
| </div> |
| |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-collapsible" aria-controls="navbar-collapsible" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| |
| <div id="navbar-collapsible" class="col-lg-9 collapse navbar-collapse"> |
| <div id="navbar-center" class="mr-auto"> |
| |
| <div class="navbar-center-item"> |
| <ul id="navbar-main-elements" class="navbar-nav"> |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| <li class="toctree-l1 current active nav-item"> |
| <a class="reference internal nav-link" href="index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| |
| </ul> |
| </div> |
| |
| </div> |
| |
| <div id="navbar-end"> |
| |
| <div class="navbar-end-item"> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 3.5.5 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "getting_started/quickstart_df.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script> |
| </div> |
| |
| </div> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| |
| |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| |
| <div class="toc-item"> |
| |
| <div class="tocsection onthispage pt-5 pb-3"> |
| <i class="fas fa-list"></i> On this page |
| </div> |
| |
| <nav id="bd-toc-nav"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#DataFrame-Creation"> |
| DataFrame Creation |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#Viewing-Data"> |
| Viewing Data |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#Selecting-and-Accessing-Data"> |
| Selecting and Accessing Data |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#Applying-a-Function"> |
| Applying a Function |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#Grouping-Data"> |
| Grouping Data |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#Getting-Data-In/Out"> |
| Getting Data In/Out |
| </a> |
| <ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#CSV"> |
| CSV |
| </a> |
| </li> |
| <li class="toc-h3 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#Parquet"> |
| Parquet |
| </a> |
| </li> |
| <li class="toc-h3 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#ORC"> |
| ORC |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#Working-with-SQL"> |
| Working with SQL |
| </a> |
| </li> |
| </ul> |
| |
| </nav> |
| </div> |
| |
| <div class="toc-item"> |
| |
| </div> |
| |
| |
| </div> |
| |
| |
| |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| |
| <div class="section" id="Quickstart:-DataFrame"> |
| <h1>Quickstart: DataFrame<a class="headerlink" href="#Quickstart:-DataFrame" title="Permalink to this headline">¶</a></h1> |
| <p>This is a short introduction and quickstart for the PySpark DataFrame API. PySpark DataFrames are lazily evaluated. They are implemented on top of <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#overview">RDD</a>s. When Spark <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#transformations">transforms</a> data, it does not compute the transformation immediately but plans how to compute it later. When |
| <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#actions">actions</a> such as <code class="docutils literal notranslate"><span class="pre">collect()</span></code> are explicitly called, the computation starts. This notebook covers the basic usage of DataFrames and is geared mainly toward new users. You can run the latest version of these examples yourself in “Live Notebook: DataFrame” at <a class="reference external" href="https://spark.apache.org/docs/latest/api/python/getting_started/index.html">the quickstart page</a>.</p> |
| <p>There is also other useful information on the Apache Spark documentation site; see the latest versions of <a class="reference external" href="https://spark.apache.org/docs/latest/sql-programming-guide.html">Spark SQL and DataFrames</a>, the <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html">RDD Programming Guide</a>, the <a class="reference external" href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html">Structured Streaming Programming Guide</a>, the <a class="reference external" href="https://spark.apache.org/docs/latest/streaming-programming-guide.html">Spark Streaming Programming |
| Guide</a> and the <a class="reference external" href="https://spark.apache.org/docs/latest/ml-guide.html">Machine Learning Library (MLlib) Guide</a>.</p> |
| <p>PySpark applications start by initializing a <code class="docutils literal notranslate"><span class="pre">SparkSession</span></code>, which is the entry point of PySpark, as shown below. When running in the PySpark shell via the <code class="docutils literal notranslate"><span class="pre">pyspark</span></code> executable, the shell automatically creates the session in the variable <code class="docutils literal notranslate"><span class="pre">spark</span></code> for users.</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
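| <p>If you are writing a standalone script rather than using the shell, you may want to configure the session explicitly before creating it. The sketch below is illustrative only; the application name and the <code class="docutils literal notranslate"><span class="pre">local[*]</span></code> master are assumptions for a single-machine run, not required settings.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Illustrative configuration; the app name and local[*] master are assumptions |
| # for running on a single machine with all available cores. |
| spark = SparkSession.builder.appName("quickstart-df").master("local[*]").getOrCreate() |
| </pre></div> |
| </div> |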
| <div class="section" id="DataFrame-Creation"> |
| <h2>DataFrame Creation<a class="headerlink" href="#DataFrame-Creation" title="Permalink to this headline">¶</a></h2> |
| <p>A PySpark DataFrame can be created via <code class="docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession.createDataFrame</span></code>, typically by passing a list of lists, tuples, dictionaries or <code class="docutils literal notranslate"><span class="pre">pyspark.sql.Row</span></code>s, a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a>, or an RDD consisting of such a list. <code class="docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession.createDataFrame</span></code> takes a <code class="docutils literal notranslate"><span class="pre">schema</span></code> argument to specify the schema of the DataFrame. When it is omitted, PySpark infers the schema by taking a sample from |
| the data.</p> |
| <p>First, you can create a PySpark DataFrame from a list of rows.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datetime</span><span class="w"> </span><span class="kn">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">date</span> |
| <span class="kn">import</span><span class="w"> </span><span class="nn">pandas</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pd</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">Row</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">2.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string1'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">3.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string2'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">5.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string3'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span> |
| <span class="p">])</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>Create a PySpark DataFrame with an explicit schema.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="s1">'string1'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">4.</span><span class="p">,</span> <span class="s1">'string3'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span> |
| <span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="s1">'a long, b double, c string, d date, e timestamp'</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>Create a PySpark DataFrame from a pandas DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">pandas_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span> |
| <span class="s1">'a'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> |
| <span class="s1">'b'</span><span class="p">:</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">],</span> |
| <span class="s1">'c'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'string1'</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">,</span> <span class="s1">'string3'</span><span class="p">],</span> |
| <span class="s1">'d'</span><span class="p">:</span> <span class="p">[</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">)],</span> |
| <span class="s1">'e'</span><span class="p">:</span> <span class="p">[</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)]</span> |
| <span class="p">})</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">pandas_df</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
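| <p>As mentioned above, a DataFrame can also be created from an RDD. The sketch below is a minimal example and assumes a classic (non-Spark-Connect) session in which <code class="docutils literal notranslate"><span class="pre">spark.sparkContext</span></code> is available.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Illustrative sketch: build an RDD of tuples and convert it to a DataFrame. |
| # Assumes spark.sparkContext is available (not the case under Spark Connect). |
| rdd = spark.sparkContext.parallelize([ |
|     (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)), |
|     (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)), |
|     (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)) |
| ]) |
| df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e']) |
| df |
| </pre></div> |
| </div> |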
| <p>The DataFrames created above all have the same results and schema.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="c1"># All DataFrames above result same.</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| | 2|3.0|string2|2000-02-01|2000-01-02 12:00:00| |
| | 3|4.0|string3|2000-03-01|2000-01-03 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| root |
| |-- a: long (nullable = true) |
| |-- b: double (nullable = true) |
| |-- c: string (nullable = true) |
| |-- d: date (nullable = true) |
| |-- e: timestamp (nullable = true) |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="Viewing-Data"> |
| <h2>Viewing Data<a class="headerlink" href="#Viewing-Data" title="Permalink to this headline">¶</a></h2> |
| <p>The top rows of a DataFrame can be displayed using <code class="docutils literal notranslate"><span class="pre">DataFrame.show()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| only showing top 1 row |
| |
| </pre></div></div> |
| </div> |
| <p>Alternatively, you can enable the <code class="docutils literal notranslate"><span class="pre">spark.sql.repl.eagerEval.enabled</span></code> configuration for eager evaluation of PySpark DataFrames in notebooks such as Jupyter. The number of rows to show can be controlled via the <code class="docutils literal notranslate"><span class="pre">spark.sql.repl.eagerEval.maxNumRows</span></code> configuration.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s1">'spark.sql.repl.eagerEval.enabled'</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="output_area rendered_html docutils container"> |
| <table border='1'> |
| <tr><th>a</th><th>b</th><th>c</th><th>d</th><th>e</th></tr> |
| <tr><td>1</td><td>2.0</td><td>string1</td><td>2000-01-01</td><td>2000-01-01 12:00:00</td></tr> |
| <tr><td>2</td><td>3.0</td><td>string2</td><td>2000-02-01</td><td>2000-01-02 12:00:00</td></tr> |
| <tr><td>3</td><td>4.0</td><td>string3</td><td>2000-03-01</td><td>2000-01-03 12:00:00</td></tr> |
| </table></div> |
| </div> |
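| <p>The row limit for this eager HTML rendering can be adjusted as well; the value below is arbitrary and only meant to illustrate the setting.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Illustrative: render at most 10 rows in the eager HTML output (the value is arbitrary). |
| spark.conf.set('spark.sql.repl.eagerEval.maxNumRows', 10) |
| df |
| </pre></div> |
| </div> |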
| <p>The rows can also be shown vertically. This is useful when rows are too long to show horizontally.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">vertical</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| -RECORD 0------------------ |
| a | 1 |
| b | 2.0 |
| c | string1 |
| d | 2000-01-01 |
| e | 2000-01-01 12:00:00 |
| only showing top 1 row |
| |
| </pre></div></div> |
| </div> |
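| <p>By default, <code class="docutils literal notranslate"><span class="pre">DataFrame.show()</span></code> truncates long string values. If you need the full values, the call above can be combined with <code class="docutils literal notranslate"><span class="pre">truncate=False</span></code>, as in this small sketch.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Show one record vertically without truncating long values. |
| df.show(1, truncate=False, vertical=True) |
| </pre></div> |
| </div> |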
| <p>You can see the DataFrame’s schema and column names as follows:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">columns</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| ['a', 'b', 'c', 'd', 'e'] |
| </pre></div></div> |
| </div> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| root |
| |-- a: long (nullable = true) |
| |-- b: double (nullable = true) |
| |-- c: string (nullable = true) |
| |-- d: date (nullable = true) |
| |-- e: timestamp (nullable = true) |
| |
| </pre></div></div> |
| </div> |
| <p>Show a summary of the DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"a"</span><span class="p">,</span> <span class="s2">"b"</span><span class="p">,</span> <span class="s2">"c"</span><span class="p">)</span><span class="o">.</span><span class="n">describe</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+---+---+-------+ |
| |summary| a| b| c| |
| +-------+---+---+-------+ |
| | count| 3| 3| 3| |
| | mean|2.0|3.0| null| |
| | stddev|1.0|1.0| null| |
| | min| 1|2.0|string1| |
| | max| 3|4.0|string3| |
| +-------+---+---+-------+ |
| |
| </pre></div></div> |
| </div> |
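| <p>For a similar overview that can also include approximate percentiles, <code class="docutils literal notranslate"><span class="pre">DataFrame.summary()</span></code> can be used; the statistics requested below are just an example.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Illustrative: request specific statistics, including approximate quartiles. |
| df.select("a", "b").summary("count", "min", "25%", "75%", "max").show() |
| </pre></div> |
| </div> |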
| <p><code class="docutils literal notranslate"><span class="pre">DataFrame.collect()</span></code> collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver side because it collects all the data from executors to the driver side.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| [Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)), |
| Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)), |
| Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))] |
| </pre></div></div> |
| </div> |
| <p>To avoid an out-of-memory exception, use <code class="docutils literal notranslate"><span class="pre">DataFrame.take()</span></code> or <code class="docutils literal notranslate"><span class="pre">DataFrame.tail()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| [Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))] |
| </pre></div></div> |
| </div> |
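| <p><code class="docutils literal notranslate"><span class="pre">DataFrame.tail()</span></code> works analogously but returns rows from the end of the DataFrame:</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Collect only the last row to the driver. |
| df.tail(1) |
| </pre></div> |
| </div> |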
| <p>A PySpark DataFrame can also be converted back to a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> to leverage the pandas API. Note that <code class="docutils literal notranslate"><span class="pre">toPandas</span></code> also collects all data to the driver, which can easily cause an out-of-memory error when the data is too large to fit on the driver.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="output_area rendered_html docutils container"> |
| <div> |
| <table border="1" class="dataframe"> |
| <thead> |
| <tr style="text-align: right;"> |
| <th></th> |
| <th>a</th> |
| <th>b</th> |
| <th>c</th> |
| <th>d</th> |
| <th>e</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <th>0</th> |
| <td>1</td> |
| <td>2.0</td> |
| <td>string1</td> |
| <td>2000-01-01</td> |
| <td>2000-01-01 12:00:00</td> |
| </tr> |
| <tr> |
| <th>1</th> |
| <td>2</td> |
| <td>3.0</td> |
| <td>string2</td> |
| <td>2000-02-01</td> |
| <td>2000-01-02 12:00:00</td> |
| </tr> |
| <tr> |
| <th>2</th> |
| <td>3</td> |
| <td>4.0</td> |
| <td>string3</td> |
| <td>2000-03-01</td> |
| <td>2000-01-03 12:00:00</td> |
| </tr> |
| </tbody> |
| </table> |
| </div></div> |
| </div> |
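| <p>If the result does fit on the driver but the conversion is slow, enabling the Arrow-based transfer may speed up <code class="docutils literal notranslate"><span class="pre">toPandas</span></code>; whether it helps depends on the data types and sizes involved.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Illustrative: enable Arrow-based columnar transfers for toPandas(). |
| spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True) |
| df.toPandas() |
| </pre></div> |
| </div> |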
| </div> |
| <div class="section" id="Selecting-and-Accessing-Data"> |
| <h2>Selecting and Accessing Data<a class="headerlink" href="#Selecting-and-Accessing-Data" title="Permalink to this headline">¶</a></h2> |
| <p>A PySpark DataFrame is lazily evaluated, so simply selecting a column does not trigger the computation; it returns a <code class="docutils literal notranslate"><span class="pre">Column</span></code> instance instead.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">a</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| Column&lt;b'a'&gt; |
| </pre></div></div> |
| </div> |
| <p>In fact, most column-wise operations return <code class="docutils literal notranslate"><span class="pre">Column</span></code>s.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[17]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">Column</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">upper</span> |
| |
| <span class="nb">type</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">)</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">upper</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">))</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="o">.</span><span class="n">isNull</span><span class="p">())</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[17]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| True |
| </pre></div></div> |
| </div> |
| <p>These <code class="docutils literal notranslate"><span class="pre">Column</span></code>s can be used to select columns from a DataFrame. For example, <code class="docutils literal notranslate"><span class="pre">DataFrame.select()</span></code> takes <code class="docutils literal notranslate"><span class="pre">Column</span></code> instances and returns another DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[18]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+ |
| | c| |
| +-------+ |
| |string1| |
| |string2| |
| |string3| |
| +-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>Assign a new <code class="docutils literal notranslate"><span class="pre">Column</span></code> instance.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[19]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">'upper_c'</span><span class="p">,</span> <span class="n">upper</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+-------+ |
| | a| b| c| d| e|upper_c| |
| +---+---+-------+----------+-------------------+-------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1| |
| | 2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2| |
| | 3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3| |
| +---+---+-------+----------+-------------------+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>To select a subset of rows, use <code class="docutils literal notranslate"><span class="pre">DataFrame.filter()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[20]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">a</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| </pre></div></div> |
| </div> |
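| <p><code class="docutils literal notranslate"><span class="pre">DataFrame.filter()</span></code> also accepts a SQL expression string, which can be convenient for simple conditions; the condition below is only an example.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # The same filter expressed as a SQL string. |
| df.filter("a = 1").show() |
| </pre></div> |
| </div> |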
| </div> |
| <div class="section" id="Applying-a-Function"> |
| <h2>Applying a Function<a class="headerlink" href="#Applying-a-Function" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark supports various UDFs and APIs that allow users to execute Python native functions. See also the latest <a class="reference external" href="https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs">Pandas UDFs</a> and <a class="reference external" href="https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-function-apis">Pandas Function APIs</a>. For instance, the example below allows users to directly use the APIs of <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html">a pandas |
| Series</a> within a Python native function.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[21]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">import</span><span class="w"> </span><span class="nn">pandas</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pd</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">pandas_udf</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="s1">'long'</span><span class="p">)</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">pandas_plus_one</span><span class="p">(</span><span class="n">series</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="c1"># Simply plus one by using pandas Series.</span> |
| <span class="k">return</span> <span class="n">series</span> <span class="o">+</span> <span class="mi">1</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">pandas_plus_one</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">a</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +------------------+ |
| |pandas_plus_one(a)| |
| +------------------+ |
| | 2| |
| | 3| |
| | 4| |
| +------------------+ |
| |
| </pre></div></div> |
| </div> |
| <p>Another example is <code class="docutils literal notranslate"><span class="pre">DataFrame.mapInPandas</span></code>, which allows users to directly use the APIs of a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> without restrictions such as the result length.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[22]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="k">def</span><span class="w"> </span><span class="nf">pandas_filter_func</span><span class="p">(</span><span class="n">iterator</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">pandas_df</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> |
| <span class="k">yield</span> <span class="n">pandas_df</span><span class="p">[</span><span class="n">pandas_df</span><span class="o">.</span><span class="n">a</span> <span class="o">==</span> <span class="mi">1</span><span class="p">]</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span><span class="n">pandas_filter_func</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| </pre></div></div> |
| </div> |
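| <p>As a minimal sketch (not an executed cell of this notebook, and reusing <code class="docutils literal notranslate"><span class="pre">pd</span></code> and <code class="docutils literal notranslate"><span class="pre">df</span></code> from the cells above), the hypothetical <code class="docutils literal notranslate"><span class="pre">double_rows</span></code> function below illustrates that the output length is unconstrained: each input batch is emitted twice, so the result contains more rows than the input.</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Illustrative only: mapInPandas may return more (or fewer) rows than it receives. |
| def double_rows(iterator): |
|     for pandas_df in iterator: |
|         # Emit each batch twice, doubling the number of rows. |
|         yield pd.concat([pandas_df, pandas_df]) |
| |
| df.mapInPandas(double_rows, schema=df.schema).show() |
| </pre></div></div> |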
| </div> |
| <div class="section" id="Grouping-Data"> |
| <h2>Grouping Data<a class="headerlink" href="#Grouping-Data" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark DataFrame also provides a way of handling grouped data by using the common split-apply-combine strategy. It groups the data by a certain condition, applies a function to each group, and then combines the results back into a DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[23]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">10</span><span class="p">],</span> <span class="p">[</span><span class="s1">'blue'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">20</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">30</span><span class="p">],</span> |
| <span class="p">[</span><span class="s1">'blue'</span><span class="p">,</span> <span class="s1">'grape'</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">40</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">50</span><span class="p">],</span> <span class="p">[</span><span class="s1">'black'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">60</span><span class="p">],</span> |
| <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">70</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'grape'</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">80</span><span class="p">]],</span> <span class="n">schema</span><span class="o">=</span><span class="p">[</span><span class="s1">'color'</span><span class="p">,</span> <span class="s1">'fruit'</span><span class="p">,</span> <span class="s1">'v1'</span><span class="p">,</span> <span class="s1">'v2'</span><span class="p">])</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| <p>Grouping and then applying the <code class="docutils literal notranslate"><span class="pre">avg()</span></code> function to the resulting groups.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[24]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'color'</span><span class="p">)</span><span class="o">.</span><span class="n">avg</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+-------+-------+ |
| |color|avg(v1)|avg(v2)| |
| +-----+-------+-------+ |
| | red| 4.8| 48.0| |
| |black| 6.0| 60.0| |
| | blue| 3.0| 30.0| |
| +-----+-------+-------+ |
| |
| </pre></div></div> |
| </div> |
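| <p>The shorthand above averages every numeric column. As a minimal sketch (not an executed cell of this notebook), the same aggregation can be written with <code class="docutils literal notranslate"><span class="pre">agg()</span></code> and explicit aggregate functions, which is handy when only some columns should be aggregated:</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| from pyspark.sql.functions import avg |
| |
| # Equivalent to df.groupby('color').avg(), but with the aggregated columns listed explicitly. |
| df.groupby('color').agg(avg('v1'), avg('v2')).show() |
| </pre></div></div> |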
| <p>You can also apply a Python native function to each group by using the pandas API.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[25]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="k">def</span><span class="w"> </span><span class="nf">plus_mean</span><span class="p">(</span><span class="n">pandas_df</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pandas_df</span><span class="o">.</span><span class="n">assign</span><span class="p">(</span><span class="n">v1</span><span class="o">=</span><span class="n">pandas_df</span><span class="o">.</span><span class="n">v1</span> <span class="o">-</span> <span class="n">pandas_df</span><span class="o">.</span><span class="n">v1</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'color'</span><span class="p">)</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span><span class="n">plus_mean</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| -3| 10| |
| | red|carrot| -1| 30| |
| | red|carrot| 0| 50| |
| | red|banana| 2| 70| |
| | red| grape| 3| 80| |
| |black|carrot| 0| 60| |
| | blue|banana| -1| 20| |
| | blue| grape| 1| 40| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| <p>Co-grouping and applying a function.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[26]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000102</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000102</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">)],</span> |
| <span class="p">(</span><span class="s1">'time'</span><span class="p">,</span> <span class="s1">'id'</span><span class="p">,</span> <span class="s1">'v1'</span><span class="p">))</span> |
| |
| <span class="n">df2</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s1">'x'</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="s1">'y'</span><span class="p">)],</span> |
| <span class="p">(</span><span class="s1">'time'</span><span class="p">,</span> <span class="s1">'id'</span><span class="p">,</span> <span class="s1">'v2'</span><span class="p">))</span> |
| |
| <span class="k">def</span><span class="w"> </span><span class="nf">merge_ordered</span><span class="p">(</span><span class="n">l</span><span class="p">,</span> <span class="n">r</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">merge_ordered</span><span class="p">(</span><span class="n">l</span><span class="p">,</span> <span class="n">r</span><span class="p">)</span> |
| |
| <span class="n">df1</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'id'</span><span class="p">)</span><span class="o">.</span><span class="n">cogroup</span><span class="p">(</span><span class="n">df2</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'id'</span><span class="p">))</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span> |
| <span class="n">merge_ordered</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="s1">'time int, id int, v1 double, v2 string'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------+---+---+---+ |
| | time| id| v1| v2| |
| +--------+---+---+---+ |
| |20000101| 1|1.0| x| |
| |20000102| 1|3.0| x| |
| |20000101| 2|2.0| y| |
| |20000102| 2|4.0| y| |
| +--------+---+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="Getting-Data-In/Out"> |
| <h2>Getting Data In/Out<a class="headerlink" href="#Getting-Data-In/Out" title="Permalink to this headline">¶</a></h2> |
| <p>CSV is straightforward and easy to use. Parquet and ORC are efficient and compact file formats that are faster to read and write.</p> |
| <p>Many other data sources are also available in PySpark, such as JDBC, text, binaryFile, Avro, etc. See also the latest <a class="reference external" href="https://spark.apache.org/docs/latest/sql-programming-guide.html">Spark SQL, DataFrames and Datasets Guide</a> in the Apache Spark documentation.</p> |
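| <p>As a minimal sketch (not an executed cell of this notebook), those other sources are typically reached through the generic <code class="docutils literal notranslate"><span class="pre">format()</span></code> reader/writer API; the path <code class="docutils literal notranslate"><span class="pre">baz.json</span></code> is only an illustrative name:</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # Write and read back JSON via the generic format()/save()/load() API. |
| df.write.format('json').save('baz.json') |
| spark.read.format('json').load('baz.json').show() |
| </pre></div></div> |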
| <div class="section" id="CSV"> |
| <h3>CSV<a class="headerlink" href="#CSV" title="Permalink to this headline">¶</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[27]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s1">'foo.csv'</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s1">'foo.csv'</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
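| <p>Note that rerunning the write above fails once <code class="docutils literal notranslate"><span class="pre">foo.csv</span></code> exists, because the default save mode (<code class="docutils literal notranslate"><span class="pre">errorifexists</span></code>) raises an error for an existing path. A minimal sketch of the usual fix is to set the save mode explicitly:</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # 'overwrite' replaces any previous output at the path instead of raising an error. |
| df.write.mode('overwrite').csv('foo.csv', header=True) |
| </pre></div></div> |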
| </div> |
| <div class="section" id="Parquet"> |
| <h3>Parquet<a class="headerlink" href="#Parquet" title="Permalink to this headline">¶</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[28]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'bar.parquet'</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'bar.parquet'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="ORC"> |
| <h3>ORC<a class="headerlink" href="#ORC" title="Permalink to this headline">¶</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[29]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s1">'zoo.orc'</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s1">'zoo.orc'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| </div> |
| <div class="section" id="Working-with-SQL"> |
| <h2>Working with SQL<a class="headerlink" href="#Working-with-SQL" title="Permalink to this headline">¶</a></h2> |
| <p>DataFrame and Spark SQL share the same execution engine, so they can be used interchangeably and seamlessly. For example, you can register the DataFrame as a table and run SQL against it as shown below:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[30]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s2">"tableA"</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT count(*) from tableA"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------+ |
| |count(1)| |
| +--------+ |
| | 8| |
| +--------+ |
| |
| </pre></div></div> |
| </div> |
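| <p>Because <code class="docutils literal notranslate"><span class="pre">spark.sql()</span></code> returns an ordinary DataFrame, SQL and DataFrame operations can be chained freely. A minimal sketch (not an executed cell of this notebook):</p> |
| <div class="highlight-ipython3 notranslate"><div class="highlight"><pre> |
| # The SQL query yields a DataFrame, so DataFrame operations can follow directly. |
| spark.sql("SELECT * FROM tableA WHERE v1 > 4").groupby('color').count().show() |
| </pre></div></div> |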
| <p>In addition, UDFs can be registered and invoked in SQL out of the box:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[31]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">"integer"</span><span class="p">)</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">add_one</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">s</span> <span class="o">+</span> <span class="mi">1</span> |
| |
| <span class="n">spark</span><span class="o">.</span><span class="n">udf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">"add_one"</span><span class="p">,</span> <span class="n">add_one</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT add_one(v1) FROM tableA"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+ |
| |add_one(v1)| |
| +-----------+ |
| | 2| |
| | 3| |
| | 4| |
| | 5| |
| | 6| |
| | 7| |
| | 8| |
| | 9| |
| +-----------+ |
| |
| </pre></div></div> |
| </div> |
| <p>These SQL expressions can be mixed in directly and used as PySpark columns.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[32]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">expr</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="s1">'add_one(v1)'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">expr</span><span class="p">(</span><span class="s1">'count(*)'</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+ |
| |add_one(v1)| |
| +-----------+ |
| | 2| |
| | 3| |
| | 4| |
| | 5| |
| | 6| |
| | 7| |
| | 8| |
| | 9| |
| +-----------+ |
| |
| +--------------+ |
| |(count(1) > 0)| |
| +--------------+ |
| | true| |
| +--------------+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| |
| <!-- Previous / next buttons --> |
| <div class='prev-next-area'> |
| <a class='left-prev' id="prev-link" href="install.html" title="previous page"> |
| <i class="fas fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">Installation</p> |
| </div> |
| </a> |
| <a class='right-next' id="next-link" href="quickstart_connect.html" title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Quickstart: Spark Connect</p> |
| </div> |
| <i class="fas fa-angle-right"></i> |
| </a> |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| <script src="../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script> |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| |
| <div class="footer-item"> |
| <p class="copyright"> |
| © Copyright .<br> |
| </p> |
| </div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br> |
| </p> |
| </div> |
| |
| </div> |
| </footer> |
| </body> |
| </html> |