<!-- blob: 6c8c1cc1387062afad761ef304ad8ddcd3f39977 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>pyspark.pandas.DataFrame.assign &#8212; PySpark 3.4.3 documentation</title>
<link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
<link rel="stylesheet"
href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet"
href="../../../_static/vendor/open-sans_all/1.44.1/index.css">
<link rel="stylesheet"
href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css">
<link rel="stylesheet" href="../../../_static/basic.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js">
<script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/language_data.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.DataFrame.assign.html" />
<link rel="search" title="Search" href="../../../search.html" />
<link rel="next" title="pyspark.pandas.DataFrame.merge" href="pyspark.pandas.DataFrame.merge.html" />
<link rel="prev" title="pyspark.pandas.DataFrame.append" href="pyspark.pandas.DataFrame.append.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en" />
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main">
<div class="container-xl">
<a class="navbar-brand" href="../../../index.html">
<img src="../../../_static/spark-logo-reverse.png" class="logo" alt="logo" />
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
<ul id="navbar-main-elements" class="navbar-nav mr-auto">
<li class="nav-item ">
<a class="nav-link" href="../../../index.html">Overview</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../../getting_started/index.html">Getting Started</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../../user_guide/index.html">User Guides</a>
</li>
<li class="nav-item active">
<a class="nav-link" href="../../index.html">API Reference</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../../development/index.html">Development</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../../migration_guide/index.html">Migration Guides</a>
</li>
</ul>
<ul class="navbar-nav">
</ul>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
<i class="icon fas fa-search" aria-hidden="true"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<ul class="nav bd-sidenav">
<li class="">
<a href="../../pyspark.sql/index.html">Spark SQL</a>
</li>
<li class="active">
<a href="../index.html">Pandas API on Spark</a>
<ul>
<li class="">
<a href="../io.html">Input/Output</a>
</li>
<li class="">
<a href="../general_functions.html">General functions</a>
</li>
<li class="">
<a href="../series.html">Series</a>
</li>
<li class="active">
<a href="../frame.html">DataFrame</a>
</li>
<li class="">
<a href="../indexing.html">Index objects</a>
</li>
<li class="">
<a href="../window.html">Window</a>
</li>
<li class="">
<a href="../groupby.html">GroupBy</a>
</li>
<li class="">
<a href="../resampling.html">Resampling</a>
</li>
<li class="">
<a href="../ml.html">Machine Learning utilities</a>
</li>
<li class="">
<a href="../extensions.html">Extensions</a>
</li>
</ul>
</li>
<li class="">
<a href="../../pyspark.ss/index.html">Structured Streaming</a>
</li>
<li class="">
<a href="../../pyspark.ml.html">MLlib (DataFrame-based)</a>
</li>
<li class="">
<a href="../../pyspark.streaming.html">Spark Streaming (Legacy)</a>
</li>
<li class="">
<a href="../../pyspark.mllib.html">MLlib (RDD-based)</a>
</li>
<li class="">
<a href="../../pyspark.html">Spark Core</a>
</li>
<li class="">
<a href="../../pyspark.resource.html">Resource Management</a>
</li>
<li class="">
<a href="../../pyspark.errors.html">Errors</a>
</li>
</ul>
</div>
</nav>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<nav id="bd-toc-nav">
<ul class="nav section-nav flex-column">
</ul>
</nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<div class="section" id="pyspark-pandas-dataframe-assign">
<h1>pyspark.pandas.DataFrame.assign<a class="headerlink" href="#pyspark-pandas-dataframe-assign" title="Permalink to this headline"></a></h1>
<dl class="py method">
<dt id="pyspark.pandas.DataFrame.assign">
<code class="sig-prename descclassname">DataFrame.</code><code class="sig-name descname">assign</code><span class="sig-paren">(</span><em class="sig-param"><span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span></em><span class="sig-paren">)</span> &#x2192; pyspark.pandas.frame.DataFrame<a class="reference internal" href="../../../_modules/pyspark/pandas/frame.html#DataFrame.assign"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.pandas.DataFrame.assign" title="Permalink to this definition"></a></dt>
<dd><p>Assign new columns to a DataFrame.</p>
<p>Returns a new object with all original columns in addition to new ones.
Existing columns that are re-assigned will be overwritten.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl class="simple">
<dt><strong>**kwargs</strong><span class="classifier">dict of {str: callable, Series or Index}</span></dt><dd><p>The column names are keywords. If the values are
callable, they are computed on the DataFrame and
assigned to the new columns. The callable must not
change the input DataFrame (though pandas-on-Spark doesn’t check it).
If the values are not callable (e.g. a Series or a literal),
they are simply assigned.</p>
</dd>
</dl>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><dl class="simple">
<dt>DataFrame</dt><dd><p>A new DataFrame with the new columns in addition to
all the existing columns.</p>
</dd>
</dl>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Assigning multiple columns within the same <code class="docutils literal notranslate"><span class="pre">assign</span></code> is possible,
but you cannot refer to columns that are newly created or modified in that
same call. Referring to them is supported in pandas for Python 3.6 and later
but not in pandas-on-Spark. In pandas-on-Spark, all items are computed first,
and then assigned.</p>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;temp_c&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mf">17.0</span><span class="p">,</span> <span class="mf">25.0</span><span class="p">]},</span>
<span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Portland&#39;</span><span class="p">,</span> <span class="s1">&#39;Berkeley&#39;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span>
<span class="go"> temp_c</span>
<span class="go">Portland 17.0</span>
<span class="go">Berkeley 25.0</span>
</pre></div>
</div>
<p>Where the value is a callable, evaluated on <cite>df</cite>:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">assign</span><span class="p">(</span><span class="n">temp_f</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">temp_c</span> <span class="o">*</span> <span class="mi">9</span> <span class="o">/</span> <span class="mi">5</span> <span class="o">+</span> <span class="mi">32</span><span class="p">)</span>
<span class="go"> temp_c temp_f</span>
<span class="go">Portland 17.0 62.6</span>
<span class="go">Berkeley 25.0 77.0</span>
</pre></div>
</div>
<p>Alternatively, the same behavior can be achieved by directly
referencing an existing Series or sequence and you can also
create multiple columns within the same assign.</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">assigned</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">assign</span><span class="p">(</span><span class="n">temp_f</span><span class="o">=</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;temp_c&#39;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">9</span> <span class="o">/</span> <span class="mi">5</span> <span class="o">+</span> <span class="mi">32</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">temp_k</span><span class="o">=</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;temp_c&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="mf">273.15</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">temp_idx</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">assigned</span><span class="p">[[</span><span class="s1">&#39;temp_c&#39;</span><span class="p">,</span> <span class="s1">&#39;temp_f&#39;</span><span class="p">,</span> <span class="s1">&#39;temp_k&#39;</span><span class="p">,</span> <span class="s1">&#39;temp_idx&#39;</span><span class="p">]]</span>
<span class="go"> temp_c temp_f temp_k temp_idx</span>
<span class="go">Portland 17.0 62.6 290.15 Portland</span>
<span class="go">Berkeley 25.0 77.0 298.15 Berkeley</span>
</pre></div>
</div>
</dd></dl>
</div>
</div>
<div class='prev-next-bottom'>
<a class='left-prev' id="prev-link" href="pyspark.pandas.DataFrame.append.html" title="previous page">pyspark.pandas.DataFrame.append</a>
<a class='right-next' id="next-link" href="pyspark.pandas.DataFrame.merge.html" title="next page">pyspark.pandas.DataFrame.merge</a>
</div>
</main>
</div>
</div>
<script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
&copy; Copyright .<br/>
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>