| |
| <!DOCTYPE html> |
| |
| <html> |
| <head> |
| <meta charset="utf-8" /> |
| <title>Quickstart: DataFrame — PySpark 3.2.2 documentation</title> |
| <!-- Document head: theme stylesheets, font preloads, and the Sphinx / nbsphinx helper scripts used by this page. --> |
| |
| <link rel="stylesheet" href="../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <link rel="stylesheet" |
| href="../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <link rel="stylesheet" |
| href="../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <script id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/language_data.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <!-- The MathJax configuration block must come before the loader: the loader is async and MathJax reads text/x-mathjax-config blocks as soon as it starts, so a block placed after it may not have been parsed yet. --> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Quickstart: Pandas API on Spark" href="quickstart_ps.html" /> |
| <link rel="prev" title="Installation" href="install.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <!-- Top navigation bar: brand logo plus links to the main documentation sections. Labelled so it can be told apart from the other nav landmarks on the page. --> |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation"> |
| <div class="container-xl"> |
| |
| <!-- The image is the only content of the link, so its alt text names the link destination. --> |
| <a class="navbar-brand" href="../index.html"> |
| |
| <img src="../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item active"> |
| <a class="nav-link" href="index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <!-- Left sidebar: documentation search form followed by the section-level page list. --> |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../search.html" method="get"> |
| <i class="icon fas fa-search" aria-hidden="true"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| |
| |
| |
| <li class=""> |
| <a href="install.html">Installation</a> |
| </li> |
| |
| |
| |
| <!-- Entry for the page currently being viewed. --> |
| <li class="active"> |
| <a href="" aria-current="page">Quickstart: DataFrame</a> |
| </li> |
| |
| |
| |
| <li class=""> |
| <a href="quickstart_ps.html">Quickstart: Pandas API on Spark</a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| <!-- Explicitly close bd-toc-item; previously it was only closed implicitly by the nav end tag. --> |
| </div> |
| |
| </nav> |
| </div> |
| |
| |
| |
| <!-- Right-hand "On this page" table of contents; body[data-target="#bd-toc-nav"] uses it for scroll-spy highlighting. --> |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| <div class="tocsection onthispage pt-5 pb-3"> |
| <i class="fas fa-list" aria-hidden="true"></i> On this page |
| </div> |
| |
| <nav id="bd-toc-nav" aria-label="On this page"> |
| <ul class="nav section-nav flex-column"> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#DataFrame-Creation" class="nav-link">DataFrame Creation</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#Viewing-Data" class="nav-link">Viewing Data</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#Selecting-and-Accessing-Data" class="nav-link">Selecting and Accessing Data</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#Applying-a-Function" class="nav-link">Applying a Function</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#Grouping-Data" class="nav-link">Grouping Data</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#Getting-Data-in/out" class="nav-link">Getting Data in/out</a><ul class="nav section-nav flex-column"> |
| |
| <li class="nav-item toc-entry toc-h3"> |
| <a href="#CSV" class="nav-link">CSV</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h3"> |
| <a href="#Parquet" class="nav-link">Parquet</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h3"> |
| <a href="#ORC" class="nav-link">ORC</a> |
| </li> |
| |
| </ul> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#Working-with-SQL" class="nav-link">Working with SQL</a> |
| </li> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| |
| <style> |
| /* CSS for nbsphinx extension */ |
| /* Styles the notebook input/output cells (div.nbinput / div.nboutput), their prompts, ANSI-coloured output and rendered HTML tables. */ |
| |
| /* remove conflicting styling from Sphinx themes */ |
| div.nbinput.container div.prompt *, |
| div.nboutput.container div.prompt *, |
| div.nbinput.container div.input_area pre, |
| div.nboutput.container div.output_area pre, |
| div.nbinput.container div.input_area .highlight, |
| div.nboutput.container div.output_area .highlight { |
| border: none; |
| padding: 0; |
| margin: 0; |
| box-shadow: none; |
| } |
| |
| div.nbinput.container > div[class*=highlight], |
| div.nboutput.container > div[class*=highlight] { |
| margin: 0; |
| } |
| |
| div.nbinput.container div.prompt *, |
| div.nboutput.container div.prompt * { |
| background: none; |
| } |
| |
| div.nboutput.container div.output_area .highlight, |
| div.nboutput.container div.output_area pre { |
| background: unset; |
| } |
| |
| div.nboutput.container div.output_area div.highlight { |
| color: unset; /* override Pygments text color */ |
| } |
| |
| /* avoid gaps between output lines */ |
| div.nboutput.container div[class*=highlight] pre { |
| line-height: normal; |
| } |
| |
| /* input/output containers */ |
| div.nbinput.container, |
| div.nboutput.container { |
| display: -webkit-flex; |
| display: flex; |
| align-items: flex-start; |
| margin: 0; |
| width: 100%; |
| } |
| /* stack prompt above the cell on narrow screens */ |
| @media (max-width: 540px) { |
| div.nbinput.container, |
| div.nboutput.container { |
| flex-direction: column; |
| } |
| } |
| |
| /* input container */ |
| div.nbinput.container { |
| padding-top: 5px; |
| } |
| |
| /* last container */ |
| div.nblast.container { |
| padding-bottom: 5px; |
| } |
| |
| /* input prompt */ |
| div.nbinput.container div.prompt pre { |
| color: #307FC1; |
| } |
| |
| /* output prompt */ |
| div.nboutput.container div.prompt pre { |
| color: #BF5B3D; |
| } |
| |
| /* all prompts */ |
| div.nbinput.container div.prompt, |
| div.nboutput.container div.prompt { |
| width: 4.5ex; |
| padding-top: 5px; |
| position: relative; |
| /* prefixed form first for WebKit/Safari, matching the -webkit-flex fallbacks used elsewhere in this sheet */ |
| -webkit-user-select: none; |
| user-select: none; |
| } |
| |
| div.nbinput.container div.prompt > div, |
| div.nboutput.container div.prompt > div { |
| position: absolute; |
| right: 0; |
| margin-right: 0.3ex; |
| } |
| |
| @media (max-width: 540px) { |
| div.nbinput.container div.prompt, |
| div.nboutput.container div.prompt { |
| width: unset; |
| text-align: left; |
| padding: 0.4em; |
| } |
| div.nboutput.container div.prompt.empty { |
| padding: 0; |
| } |
| |
| div.nbinput.container div.prompt > div, |
| div.nboutput.container div.prompt > div { |
| position: unset; |
| } |
| } |
| |
| /* disable scrollbars on prompts */ |
| div.nbinput.container div.prompt pre, |
| div.nboutput.container div.prompt pre { |
| overflow: hidden; |
| } |
| |
| /* input/output area */ |
| div.nbinput.container div.input_area, |
| div.nboutput.container div.output_area { |
| -webkit-flex: 1; |
| flex: 1; |
| overflow: auto; |
| } |
| @media (max-width: 540px) { |
| div.nbinput.container div.input_area, |
| div.nboutput.container div.output_area { |
| width: 100%; |
| } |
| } |
| |
| /* input area */ |
| div.nbinput.container div.input_area { |
| border: 1px solid #e0e0e0; |
| border-radius: 2px; |
| /*background: #f5f5f5;*/ |
| } |
| |
| /* override MathJax center alignment in output cells */ |
| div.nboutput.container div[class*=MathJax] { |
| text-align: left !important; |
| } |
| |
| /* override sphinx.ext.imgmath center alignment in output cells */ |
| div.nboutput.container div.math p { |
| text-align: left; |
| } |
| |
| /* standard error */ |
| div.nboutput.container div.output_area.stderr { |
| background: #fdd; |
| } |
| |
| /* ANSI colors */ |
| .ansi-black-fg { color: #3E424D; } |
| .ansi-black-bg { background-color: #3E424D; } |
| .ansi-black-intense-fg { color: #282C36; } |
| .ansi-black-intense-bg { background-color: #282C36; } |
| .ansi-red-fg { color: #E75C58; } |
| .ansi-red-bg { background-color: #E75C58; } |
| .ansi-red-intense-fg { color: #B22B31; } |
| .ansi-red-intense-bg { background-color: #B22B31; } |
| .ansi-green-fg { color: #00A250; } |
| .ansi-green-bg { background-color: #00A250; } |
| .ansi-green-intense-fg { color: #007427; } |
| .ansi-green-intense-bg { background-color: #007427; } |
| .ansi-yellow-fg { color: #DDB62B; } |
| .ansi-yellow-bg { background-color: #DDB62B; } |
| .ansi-yellow-intense-fg { color: #B27D12; } |
| .ansi-yellow-intense-bg { background-color: #B27D12; } |
| .ansi-blue-fg { color: #208FFB; } |
| .ansi-blue-bg { background-color: #208FFB; } |
| .ansi-blue-intense-fg { color: #0065CA; } |
| .ansi-blue-intense-bg { background-color: #0065CA; } |
| .ansi-magenta-fg { color: #D160C4; } |
| .ansi-magenta-bg { background-color: #D160C4; } |
| .ansi-magenta-intense-fg { color: #A03196; } |
| .ansi-magenta-intense-bg { background-color: #A03196; } |
| .ansi-cyan-fg { color: #60C6C8; } |
| .ansi-cyan-bg { background-color: #60C6C8; } |
| .ansi-cyan-intense-fg { color: #258F8F; } |
| .ansi-cyan-intense-bg { background-color: #258F8F; } |
| .ansi-white-fg { color: #C5C1B4; } |
| .ansi-white-bg { background-color: #C5C1B4; } |
| .ansi-white-intense-fg { color: #A1A6B2; } |
| .ansi-white-intense-bg { background-color: #A1A6B2; } |
| |
| .ansi-default-inverse-fg { color: #FFFFFF; } |
| .ansi-default-inverse-bg { background-color: #000000; } |
| |
| .ansi-bold { font-weight: bold; } |
| .ansi-underline { text-decoration: underline; } |
| |
| |
| /* uniform inner padding for code, math, rendered HTML, JavaScript and image output */ |
| div.nbinput.container div.input_area div[class*=highlight] > pre, |
| div.nboutput.container div.output_area div[class*=highlight] > pre, |
| div.nboutput.container div.output_area div[class*=highlight].math, |
| div.nboutput.container div.output_area.rendered_html, |
| div.nboutput.container div.output_area > div.output_javascript, |
| div.nboutput.container div.output_area:not(.rendered_html) > img{ |
| padding: 5px; |
| margin: 0; |
| } |
| |
| /* fix copybtn overflow problem in chromium (needed for 'sphinx_copybutton') */ |
| div.nbinput.container div.input_area > div[class^='highlight'], |
| div.nboutput.container div.output_area > div[class^='highlight']{ |
| overflow-y: hidden; |
| } |
| |
| /* hide copybtn icon on prompts (needed for 'sphinx_copybutton') */ |
| .prompt a.copybtn { |
| display: none; |
| } |
| |
| /* Some additional styling taken from the Jupyter notebook CSS */ |
| div.rendered_html table { |
| border: none; |
| border-collapse: collapse; |
| border-spacing: 0; |
| color: black; |
| font-size: 12px; |
| table-layout: fixed; |
| } |
| div.rendered_html thead { |
| border-bottom: 1px solid black; |
| vertical-align: bottom; |
| } |
| div.rendered_html tr, |
| div.rendered_html th, |
| div.rendered_html td { |
| text-align: right; |
| vertical-align: middle; |
| padding: 0.5em 0.5em; |
| line-height: normal; |
| white-space: normal; |
| max-width: none; |
| border: none; |
| } |
| div.rendered_html th { |
| font-weight: bold; |
| } |
| div.rendered_html tbody tr:nth-child(odd) { |
| background: #f5f5f5; |
| } |
| div.rendered_html tbody tr:hover { |
| background: rgba(66, 165, 245, 0.2); |
| } |
| </style> |
| <div class="section" id="Quickstart:-DataFrame"> |
| <h1>Quickstart: DataFrame<a class="headerlink" href="#Quickstart:-DataFrame" title="Permalink to this headline">¶</a></h1> |
| <p>This is a short introduction and quickstart for the PySpark DataFrame API. PySpark DataFrames are lazily evaluated. They are implemented on top of <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#overview">RDD</a>s. When Spark <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#transformations">transforms</a> data, it does not immediately compute the transformation but plans how to compute later. When |
| <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html#actions">actions</a> such as <code class="docutils literal notranslate"><span class="pre">collect()</span></code> are explicitly called, the computation starts. This notebook shows the basic usage of the DataFrame, geared mainly toward new users. You can run the latest version of these examples yourself in “Live Notebook: DataFrame” at <a class="reference external" href="https://spark.apache.org/docs/latest/api/python/getting_started/index.html">the quickstart page</a>.</p> |
| <p>Other useful information is also available on the Apache Spark documentation site; see the latest versions of <a class="reference external" href="https://spark.apache.org/docs/latest/sql-programming-guide.html">Spark SQL and DataFrames</a>, <a class="reference external" href="https://spark.apache.org/docs/latest/rdd-programming-guide.html">RDD Programming Guide</a>, <a class="reference external" href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html">Structured Streaming Programming Guide</a>, <a class="reference external" href="https://spark.apache.org/docs/latest/streaming-programming-guide.html">Spark Streaming Programming |
| Guide</a> and <a class="reference external" href="https://spark.apache.org/docs/latest/ml-guide.html">Machine Learning Library (MLlib) Guide</a>.</p> |
| <p>PySpark applications start by initializing a <code class="docutils literal notranslate"><span class="pre">SparkSession</span></code>, which is the entry point of PySpark, as shown below. When running in the PySpark shell via the pyspark executable, the shell automatically creates the session in the variable spark for users.</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="DataFrame-Creation"> |
| <h2>DataFrame Creation<a class="headerlink" href="#DataFrame-Creation" title="Permalink to this headline">¶</a></h2> |
| <p>A PySpark DataFrame can be created via <code class="docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession.createDataFrame</span></code> typically by passing a list of lists, tuples, dictionaries and <code class="docutils literal notranslate"><span class="pre">pyspark.sql.Row</span></code>s, a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> and an RDD consisting of such a list. <code class="docutils literal notranslate"><span class="pre">pyspark.sql.SparkSession.createDataFrame</span></code> takes the <code class="docutils literal notranslate"><span class="pre">schema</span></code> argument to specify the schema of the DataFrame. When it is omitted, PySpark infers the corresponding schema by taking a sample from |
| the data.</p> |
| <p>First, you can create a PySpark DataFrame from a list of rows.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">date</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">2.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string1'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">3.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string2'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">a</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">b</span><span class="o">=</span><span class="mf">4.</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">'string3'</span><span class="p">,</span> <span class="n">d</span><span class="o">=</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">e</span><span class="o">=</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span> |
| <span class="p">])</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>Create a PySpark DataFrame with an explicit schema.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="s1">'string1'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">4.</span><span class="p">,</span> <span class="s1">'string3'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span> |
| <span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="s1">'a long, b double, c string, d date, e timestamp'</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>Create a PySpark DataFrame from a pandas DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">pandas_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span> |
| <span class="s1">'a'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> |
| <span class="s1">'b'</span><span class="p">:</span> <span class="p">[</span><span class="mf">2.</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="mf">4.</span><span class="p">],</span> |
| <span class="s1">'c'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'string1'</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">,</span> <span class="s1">'string3'</span><span class="p">],</span> |
| <span class="s1">'d'</span><span class="p">:</span> <span class="p">[</span><span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">)],</span> |
| <span class="s1">'e'</span><span class="p">:</span> <span class="p">[</span><span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)]</span> |
| <span class="p">})</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">pandas_df</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>Create a PySpark DataFrame from an RDD consisting of a list of tuples.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">rdd</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">2.</span><span class="p">,</span> <span class="s1">'string1'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">3.</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">4.</span><span class="p">,</span> <span class="s1">'string3'</span><span class="p">,</span> <span class="n">date</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">datetime</span><span class="p">(</span><span class="mi">2000</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span> |
| <span class="p">])</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="p">[</span><span class="s1">'a'</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">,</span> <span class="s1">'c'</span><span class="p">,</span> <span class="s1">'d'</span><span class="p">,</span> <span class="s1">'e'</span><span class="p">])</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp] |
| </pre></div></div> |
| </div> |
| <p>The DataFrames created above all have the same results and schema.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="c1"># All DataFrames above yield the same result.</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| | 2|3.0|string2|2000-02-01|2000-01-02 12:00:00| |
| | 3|4.0|string3|2000-03-01|2000-01-03 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| root |
| |-- a: long (nullable = true) |
| |-- b: double (nullable = true) |
| |-- c: string (nullable = true) |
| |-- d: date (nullable = true) |
| |-- e: timestamp (nullable = true) |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="Viewing-Data"> |
| <h2>Viewing Data<a class="headerlink" href="#Viewing-Data" title="Permalink to this headline">¶</a></h2> |
| <p>The top rows of a DataFrame can be displayed using <code class="docutils literal notranslate"><span class="pre">DataFrame.show()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| only showing top 1 row |
| |
| </pre></div></div> |
| </div> |
| <p>Alternatively, you can enable the <code class="docutils literal notranslate"><span class="pre">spark.sql.repl.eagerEval.enabled</span></code> configuration for eager evaluation of PySpark DataFrames in notebooks such as Jupyter. The number of rows to show can be controlled via the <code class="docutils literal notranslate"><span class="pre">spark.sql.repl.eagerEval.maxNumRows</span></code> configuration.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s1">'spark.sql.repl.eagerEval.enabled'</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> |
| <span class="n">df</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="output_area rendered_html docutils container"> |
| <table border='1'> |
| <tr><th>a</th><th>b</th><th>c</th><th>d</th><th>e</th></tr> |
| <tr><td>1</td><td>2.0</td><td>string1</td><td>2000-01-01</td><td>2000-01-01 12:00:00</td></tr> |
| <tr><td>2</td><td>3.0</td><td>string2</td><td>2000-02-01</td><td>2000-01-02 12:00:00</td></tr> |
| <tr><td>3</td><td>4.0</td><td>string3</td><td>2000-03-01</td><td>2000-01-03 12:00:00</td></tr> |
| </table></div> |
| </div> |
| <p>The rows can also be shown vertically. This is useful when rows are too long to show horizontally.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">vertical</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| -RECORD 0------------------ |
| a | 1 |
| b | 2.0 |
| c | string1 |
| d | 2000-01-01 |
| e | 2000-01-01 12:00:00 |
| only showing top 1 row |
| |
| </pre></div></div> |
| </div> |
| <p>You can see the DataFrame’s schema and column names as follows:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">columns</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| ['a', 'b', 'c', 'd', 'e'] |
| </pre></div></div> |
| </div> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| root |
| |-- a: long (nullable = true) |
| |-- b: double (nullable = true) |
| |-- c: string (nullable = true) |
| |-- d: date (nullable = true) |
| |-- e: timestamp (nullable = true) |
| |
| </pre></div></div> |
| </div> |
| <p>Show the summary of the DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"a"</span><span class="p">,</span> <span class="s2">"b"</span><span class="p">,</span> <span class="s2">"c"</span><span class="p">)</span><span class="o">.</span><span class="n">describe</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+---+---+-------+ |
| |summary| a| b| c| |
| +-------+---+---+-------+ |
| | count| 3| 3| 3| |
| | mean|2.0|3.0| null| |
| | stddev|1.0|1.0| null| |
| | min| 1|2.0|string1| |
| | max| 3|4.0|string3| |
| +-------+---+---+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p><code class="docutils literal notranslate"><span class="pre">DataFrame.collect()</span></code> collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver side because it collects all the data from executors to the driver side.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| [Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)), |
| Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)), |
| Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))] |
| </pre></div></div> |
| </div> |
| <p>In order to avoid throwing an out-of-memory exception, use <code class="docutils literal notranslate"><span class="pre">DataFrame.take()</span></code> or <code class="docutils literal notranslate"><span class="pre">DataFrame.tail()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| [Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))] |
| </pre></div></div> |
| </div> |
| <p>PySpark DataFrame also provides conversion back to a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> to leverage the pandas API. Note that <code class="docutils literal notranslate"><span class="pre">toPandas</span></code> also collects all data into the driver side, which can easily cause an out-of-memory error when the data is too large to fit into the driver side.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="output_area rendered_html docutils container"> |
| <div> |
| <style scoped> |
| .dataframe tbody tr th:only-of-type { |
| vertical-align: middle; |
| } |
| |
| .dataframe tbody tr th { |
| vertical-align: top; |
| } |
| |
| .dataframe thead th { |
| text-align: right; |
| } |
| </style> |
| <table border="1" class="dataframe"> |
| <thead> |
| <tr style="text-align: right;"> |
| <th></th> |
| <th>a</th> |
| <th>b</th> |
| <th>c</th> |
| <th>d</th> |
| <th>e</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <th>0</th> |
| <td>1</td> |
| <td>2.0</td> |
| <td>string1</td> |
| <td>2000-01-01</td> |
| <td>2000-01-01 12:00:00</td> |
| </tr> |
| <tr> |
| <th>1</th> |
| <td>2</td> |
| <td>3.0</td> |
| <td>string2</td> |
| <td>2000-02-01</td> |
| <td>2000-01-02 12:00:00</td> |
| </tr> |
| <tr> |
| <th>2</th> |
| <td>3</td> |
| <td>4.0</td> |
| <td>string3</td> |
| <td>2000-03-01</td> |
| <td>2000-01-03 12:00:00</td> |
| </tr> |
| </tbody> |
| </table> |
| </div></div> |
| </div> |
| </div> |
| <div class="section" id="Selecting-and-Accessing-Data"> |
| <h2>Selecting and Accessing Data<a class="headerlink" href="#Selecting-and-Accessing-Data" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark DataFrame is lazily evaluated, and simply selecting a column does not trigger the computation; instead, it returns a <code class="docutils literal notranslate"><span class="pre">Column</span></code> instance.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">a</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| Column&lt;b'a'&gt; |
| </pre></div></div> |
| </div> |
| <p>In fact, most column-wise operations return <code class="docutils literal notranslate"><span class="pre">Column</span></code>s.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[17]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">upper</span> |
| |
| <span class="nb">type</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">)</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">upper</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">))</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="o">.</span><span class="n">isNull</span><span class="p">())</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[17]: |
| </pre></div> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| True |
| </pre></div></div> |
| </div> |
| <p>These <code class="docutils literal notranslate"><span class="pre">Column</span></code>s can be used to select the columns from a DataFrame. For example, <code class="docutils literal notranslate"><span class="pre">DataFrame.select()</span></code> takes <code class="docutils literal notranslate"><span class="pre">Column</span></code> instances and returns another DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[18]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+ |
| | c| |
| +-------+ |
| |string1| |
| |string2| |
| |string3| |
| +-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>Assign a new <code class="docutils literal notranslate"><span class="pre">Column</span></code> instance.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[19]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">'upper_c'</span><span class="p">,</span> <span class="n">upper</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">c</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+-------+ |
| | a| b| c| d| e|upper_c| |
| +---+---+-------+----------+-------------------+-------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1| |
| | 2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2| |
| | 3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3| |
| +---+---+-------+----------+-------------------+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>To select a subset of rows, use <code class="docutils literal notranslate"><span class="pre">DataFrame.filter()</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[20]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">a</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="Applying-a-Function"> |
| <h2>Applying a Function<a class="headerlink" href="#Applying-a-Function" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark supports various UDFs and APIs to allow users to execute Python native functions. See also the latest <a class="reference external" href="https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs">Pandas UDFs</a> and <a class="reference external" href="https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-function-apis">Pandas Function APIs</a>. For instance, the example below allows users to directly use the APIs in <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html">a pandas |
| Series</a> within a Python native function.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[21]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="s1">'long'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">pandas_plus_one</span><span class="p">(</span><span class="n">series</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="c1"># Simply plus one by using pandas Series.</span> |
| <span class="k">return</span> <span class="n">series</span> <span class="o">+</span> <span class="mi">1</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">pandas_plus_one</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">a</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +------------------+ |
| |pandas_plus_one(a)| |
| +------------------+ |
| | 2| |
| | 3| |
| | 4| |
| +------------------+ |
| |
| </pre></div></div> |
| </div> |
| <p>Another example is <code class="docutils literal notranslate"><span class="pre">DataFrame.mapInPandas</span></code>, which allows users to directly use the APIs in a <a class="reference external" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html">pandas DataFrame</a> without any restrictions such as the result length.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[22]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="k">def</span> <span class="nf">pandas_filter_func</span><span class="p">(</span><span class="n">iterator</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">pandas_df</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span> |
| <span class="k">yield</span> <span class="n">pandas_df</span><span class="p">[</span><span class="n">pandas_df</span><span class="o">.</span><span class="n">a</span> <span class="o">==</span> <span class="mi">1</span><span class="p">]</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span><span class="n">pandas_filter_func</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+---+-------+----------+-------------------+ |
| | a| b| c| d| e| |
| +---+---+-------+----------+-------------------+ |
| | 1|2.0|string1|2000-01-01|2000-01-01 12:00:00| |
| +---+---+-------+----------+-------------------+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="Grouping-Data"> |
| <h2>Grouping Data<a class="headerlink" href="#Grouping-Data" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark DataFrame also provides a way of handling grouped data by using the common split-apply-combine strategy. It groups the data by a certain condition, applies a function to each group, and then combines them back to the DataFrame.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[23]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">10</span><span class="p">],</span> <span class="p">[</span><span class="s1">'blue'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">20</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">30</span><span class="p">],</span> |
| <span class="p">[</span><span class="s1">'blue'</span><span class="p">,</span> <span class="s1">'grape'</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">40</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">50</span><span class="p">],</span> <span class="p">[</span><span class="s1">'black'</span><span class="p">,</span> <span class="s1">'carrot'</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">60</span><span class="p">],</span> |
| <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'banana'</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">70</span><span class="p">],</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'grape'</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">80</span><span class="p">]],</span> <span class="n">schema</span><span class="o">=</span><span class="p">[</span><span class="s1">'color'</span><span class="p">,</span> <span class="s1">'fruit'</span><span class="p">,</span> <span class="s1">'v1'</span><span class="p">,</span> <span class="s1">'v2'</span><span class="p">])</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| <p>Grouping and then applying the <code class="docutils literal notranslate"><span class="pre">avg()</span></code> function to the resulting groups.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[24]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'color'</span><span class="p">)</span><span class="o">.</span><span class="n">avg</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+-------+-------+ |
| |color|avg(v1)|avg(v2)| |
| +-----+-------+-------+ |
| | red| 4.8| 48.0| |
| |black| 6.0| 60.0| |
| | blue| 3.0| 30.0| |
| +-----+-------+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>You can also apply a Python native function against each group by using the pandas API.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[25]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="k">def</span> <span class="nf">plus_mean</span><span class="p">(</span><span class="n">pandas_df</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pandas_df</span><span class="o">.</span><span class="n">assign</span><span class="p">(</span><span class="n">v1</span><span class="o">=</span><span class="n">pandas_df</span><span class="o">.</span><span class="n">v1</span> <span class="o">-</span> <span class="n">pandas_df</span><span class="o">.</span><span class="n">v1</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'color'</span><span class="p">)</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span><span class="n">plus_mean</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| -3| 10| |
| | red|carrot| -1| 30| |
| | red|carrot| 0| 50| |
| | red|banana| 2| 70| |
| | red| grape| 3| 80| |
| |black|carrot| 0| 60| |
| | blue|banana| -1| 20| |
| | blue| grape| 1| 40| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| <p>Co-grouping and applying a function.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[26]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df1</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000102</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000102</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">)],</span> |
| <span class="p">(</span><span class="s1">'time'</span><span class="p">,</span> <span class="s1">'id'</span><span class="p">,</span> <span class="s1">'v1'</span><span class="p">))</span> |
| |
| <span class="n">df2</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s1">'x'</span><span class="p">),</span> <span class="p">(</span><span class="mi">20000101</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="s1">'y'</span><span class="p">)],</span> |
| <span class="p">(</span><span class="s1">'time'</span><span class="p">,</span> <span class="s1">'id'</span><span class="p">,</span> <span class="s1">'v2'</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="nf">asof_join</span><span class="p">(</span><span class="n">l</span><span class="p">,</span> <span class="n">r</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">merge_asof</span><span class="p">(</span><span class="n">l</span><span class="p">,</span> <span class="n">r</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">'time'</span><span class="p">,</span> <span class="n">by</span><span class="o">=</span><span class="s1">'id'</span><span class="p">)</span> |
| |
| <span class="n">df1</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'id'</span><span class="p">)</span><span class="o">.</span><span class="n">cogroup</span><span class="p">(</span><span class="n">df2</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">'id'</span><span class="p">))</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span> |
| <span class="n">asof_join</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="s1">'time int, id int, v1 double, v2 string'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------+---+---+---+ |
| | time| id| v1| v2| |
| +--------+---+---+---+ |
| |20000101| 1|1.0| x| |
| |20000102| 1|3.0| x| |
| |20000101| 2|2.0| y| |
| |20000102| 2|4.0| y| |
| +--------+---+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="Getting-Data-in/out"> |
| <h2>Getting Data in/out<a class="headerlink" href="#Getting-Data-in/out" title="Permalink to this headline">¶</a></h2> |
| <p>CSV is straightforward and easy to use. Parquet and ORC are efficient, compact file formats that are faster to read and write.</p> |
| <p>There are many other data sources available in PySpark such as JDBC, text, binaryFile, Avro, etc. See also the latest <a class="reference external" href="https://spark.apache.org/docs/latest/sql-programming-guide.html">Spark SQL, DataFrames and Datasets Guide</a> in the Apache Spark documentation.</p> |
| <div class="section" id="CSV"> |
| <h3>CSV<a class="headerlink" href="#CSV" title="Permalink to this headline">¶</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[27]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s1">'foo.csv'</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s1">'foo.csv'</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="Parquet"> |
| <h3>Parquet<a class="headerlink" href="#Parquet" title="Permalink to this headline">¶</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[28]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'bar.parquet'</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s1">'bar.parquet'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| <div class="section" id="ORC"> |
| <h3>ORC<a class="headerlink" href="#ORC" title="Permalink to this headline">¶</a></h3> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[29]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s1">'zoo.orc'</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">orc</span><span class="p">(</span><span class="s1">'zoo.orc'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+---+---+ |
| |color| fruit| v1| v2| |
| +-----+------+---+---+ |
| | red|banana| 1| 10| |
| | blue|banana| 2| 20| |
| | red|carrot| 3| 30| |
| | blue| grape| 4| 40| |
| | red|carrot| 5| 50| |
| |black|carrot| 6| 60| |
| | red|banana| 7| 70| |
| | red| grape| 8| 80| |
| +-----+------+---+---+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| </div> |
| <div class="section" id="Working-with-SQL"> |
| <h2>Working with SQL<a class="headerlink" href="#Working-with-SQL" title="Permalink to this headline">¶</a></h2> |
| <p>DataFrame and Spark SQL share the same execution engine, so they can be used interchangeably and seamlessly. For example, you can register the DataFrame as a table and easily run a SQL query as below:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[30]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="n">df</span><span class="o">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s2">"tableA"</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT count(*) from tableA"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------+ |
| |count(1)| |
| +--------+ |
| | 8| |
| +--------+ |
| |
| </pre></div></div> |
| </div> |
| <p>In addition, UDFs can be registered and invoked in SQL out of the box:</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[31]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">"integer"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">add_one</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">s</span> <span class="o">+</span> <span class="mi">1</span> |
| |
| <span class="n">spark</span><span class="o">.</span><span class="n">udf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">"add_one"</span><span class="p">,</span> <span class="n">add_one</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT add_one(v1) FROM tableA"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+ |
| |add_one(v1)| |
| +-----------+ |
| | 2| |
| | 3| |
| | 4| |
| | 5| |
| | 6| |
| | 7| |
| | 8| |
| | 9| |
| +-----------+ |
| |
| </pre></div></div> |
| </div> |
| <p>These SQL expressions can directly be mixed and used as PySpark columns.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[32]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre> |
| <span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">expr</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="s1">'add_one(v1)'</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">expr</span><span class="p">(</span><span class="s1">'count(*)'</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----------+ |
| |add_one(v1)| |
| +-----------+ |
| | 2| |
| | 3| |
| | 4| |
| | 5| |
| | 6| |
| | 7| |
| | 8| |
| | 9| |
| +-----------+ |
| |
| +--------------+ |
| |(count(1) > 0)| |
| +--------------+ |
| | true| |
| +--------------+ |
| |
| </pre></div></div> |
| </div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| <a class='left-prev' id="prev-link" href="install.html" title="previous page">Installation</a> |
| <a class='right-next' id="next-link" href="quickstart_ps.html" title="next page">Quickstart: Pandas API on Spark</a> |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| © Copyright .<br/> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |